quieter logs please
[objavi2.git] / objavi / fmbook.py
blob19bfd6a04bea87a104104418e5b672e78f2e874b
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 from subprocess import Popen, check_call, PIPE
28 from cStringIO import StringIO
29 from urllib2 import urlopen, HTTPError
30 import zipfile
31 import traceback
32 from string import ascii_letters
33 from pprint import pformat
35 try:
36 import simplejson as json
37 except ImportError:
38 import json
40 import lxml.html
41 from lxml import etree
43 from objavi import config, epub_utils
44 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
45 from objavi.book_utils import ObjaviError, log_types
46 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
47 from objavi.epub import add_guts, _find_tag
48 from objavi.xhtml_utils import EpubChapter, split_tree
49 from objavi.cgi_utils import url2path
51 from iarchive import epub as ia_epub
52 from booki.bookizip import get_metadata, add_metadata
# Working directory for intermediate book files (absolute, from config).
TMPDIR = os.path.abspath(config.TMPDIR)
# Web document root: from the CGI environment when available, otherwise
# the configured htdocs directory.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Host name of the current request ('' when not running under a web server).
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the (s3 upload url, details page url) pair on archive.org
    for the given book id and output filename."""
    bucket = 'booki-%s' % (bookid,)
    upload_url = 'http://s3.us.archive.org/%s/%s' % (bucket, bookname)
    details_url = 'http://archive.org/details/%s' % (bucket,)
    return (upload_url, details_url)
63 def _get_best_title(tocpoint):
64 if 'html_title' in tocpoint:
65 return tocpoint['html_title']
66 if 'title' in tocpoint:
67 return tocpoint['title']
68 return 'Untitled'
def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e."""
    marker = e.makeelement("strong", Class="initial")
    e.insert(0, marker)
    marker.text = "%s." % n
    #shift the element's leading text behind the inserted number
    marker.tail = ' '
    if e.text is not None:
        marker.tail += e.text
        e.text = ''
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience: annotate each point with
    its depth, filename, fragment, and serial index.  Returns the next
    unused index."""
    for point in toc:
        path = point['url'].lstrip('/')
        parts = path.split('#', 1)
        point['depth'] = depth
        point["filename"] = parts[0]
        point["fragment"] = parts[1] if len(parts) > 1 else None
        point["index"] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
97 def _serialise(rtoc, stoc, depth):
98 for item in rtoc:
99 url = item['url'].lstrip('/')
100 bits = url.split('#', 1)
101 filename = bits[0]
102 fragment = (bits[1] if len(bits) == 2 else None)
103 stoc.append({"depth": depth,
104 "title": item['title'],
105 "url": url,
106 "filename": filename,
107 "fragment": fragment,
108 "type": item['type']
110 if 'children' in item:
111 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points. Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    #record each point's place in the flat sequence
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Map each filename to the list of TOC points that refer to it,
    in depth-first pre-order."""
    tocmap = {}
    pending = list(rtoc)
    while pending:
        point = pending.pop(0)
        tocmap.setdefault(point['filename'], []).append(point)
        #visit children immediately after their parent (pre-order)
        pending[0:0] = point.get('children', [])
    return tocmap
def save_data(fn, data):
    """Save without tripping up on unicode.

    fn -- destination path; data -- str or unicode (encoded as utf8).
    The file is now closed even if the write fails (previously the
    handle leaked on error).
    """
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    f = open(fn, 'w')
    try:
        f.write(data)
    finally:
        f.close()
class Book(object):
    """A bookizip fetched from a server, being rendered into a book."""
    # numbering style for the main body pages
    page_numbers = 'latin'
    # numbering style for the front-matter (preamble) pages
    preamble_page_numbers = 'roman'
148 def notify_watcher(self, message=None):
149 if self.watchers:
150 if message is None:
151 #message is the name of the caller
152 message = traceback.extract_stack(None, 2)[0][2]
153 log("notify_watcher called with '%s'" % message)
154 for w in self.watchers:
155 w(message)
157 def __enter__(self):
158 return self
160 def __exit__(self, exc_type, exc_value, tb):
161 self.notify_watcher(config.FINISHED_MESSAGE)
162 self.cleanup()
163 #could deal with exceptions here and return true
166 def __init__(self, book, server, bookname,
167 page_settings=None, watchers=None, isbn=None,
168 license=config.DEFAULT_LICENSE, title=None,
169 max_age=0):
170 log("*** Starting new book %s ***" % bookname)
171 self.watchers = set()
172 if watchers is not None:
173 self.watchers.update(watchers)
174 self.notify_watcher('start')
175 self.bookname = bookname
176 self.book = book
177 self.server = server
178 self.cookie = ''.join(random.sample(ascii_letters, 10))
179 try:
180 blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
181 except HTTPError, e:
182 traceback.print_exc()
183 self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
184 #not much to do?
185 #raise 502 Bad Gateway ?
186 sys.exit()
187 f = StringIO(blob)
188 self.notify_watcher('fetch_zip')
189 self.store = zipfile.ZipFile(f, 'r')
190 self.info = json.loads(self.store.read('info.json'))
191 for k in ('manifest', 'metadata', 'spine', 'TOC'):
192 if k not in self.info:
193 raise ObjaviError('info.json of %s lacks vital element "%s"' %
194 (bookname, k))
195 #check types also?
197 self.metadata = self.info['metadata']
198 self.spine = self.info['spine']
199 self.manifest = self.info['manifest']
201 if server == config.LOCALHOST: # [DEPRECATED]
202 server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
203 book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
205 log(pformat(self.metadata))
206 self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
207 if not self.lang:
208 self.lang = guess_lang(server, book)
209 log('guessed lang as %s' % self.lang)
211 self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
212 if not self.toc_header:
213 self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
215 self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
216 if not self.dir:
217 self.dir = guess_text_dir(server, book)
219 #Patch in the extra metadata. (lang and dir may be set from config)
220 #these should be read from zip -- so should go into zip?
221 for var, key, scheme, ns in (
222 (isbn, 'id', 'ISBN', config.DC),
223 (license, 'rights', 'License', config.DC),
224 (title, 'title', '', config.DC),
225 (self.lang, 'language', '', config.DC),
226 (self.dir, 'dir', '', config.FM),
228 if var is not None:
229 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
231 self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
232 self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
234 self.toc = self.info['TOC']
235 expand_toc(self.toc)
237 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
238 os.chmod(self.workdir, 0755)
240 self.body_html_file = self.filepath('body.html')
241 self.body_pdf_file = self.filepath('body.pdf')
242 self.preamble_html_file = self.filepath('preamble.html')
243 self.preamble_pdf_file = self.filepath('preamble.pdf')
244 self.tail_html_file = self.filepath('tail.html')
245 self.tail_pdf_file = self.filepath('tail.pdf')
246 self.isbn_pdf_file = None
247 self.pdf_file = self.filepath('final.pdf')
248 self.body_odt_file = self.filepath('body.odt')
249 self.outline_file = self.filepath('outline.txt')
251 self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
253 if page_settings is not None:
254 self.maker = PageSettings(**page_settings)
256 if title is not None:
257 self.title = title
258 else:
259 titles = get_metadata(self.metadata, 'title')
260 if titles:
261 self.title = titles[0]
262 else:
263 self.title = 'A Book About ' + self.book
265 self.notify_watcher()
    #Optionally clean up the workdir when the Book is garbage collected.
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
    def get_tree_by_id(self, id):
        """get an HTML tree from the given manifest ID

        Returns an lxml tree for html/xml content, or the raw string
        read from the zip for any other mimetype.
        """
        name = self.manifest[id]['url']
        mimetype = self.manifest[id]['mimetype']
        s = self.store.read(name)
        f = StringIO(s)
        if mimetype == 'text/html':
            try:
                tree = lxml.html.parse(f)
            except etree.XMLSyntaxError, e:
                log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                    (id, name, s[:20], e))
                #fall back to an empty document so processing can continue
                tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
        elif 'xml' in mimetype: #XXX or is this just asking for trouble?
            tree = etree.parse(f)
        else:
            #not markup: hand back the raw bytes
            tree = f.read()
        f.close()
        return tree
296 def filepath(self, fn):
297 return os.path.join(self.workdir, fn)
299 def save_tempfile(self, fn, data):
300 """Save the data in a temporary directory that will be cleaned
301 up when all is done. Return the absolute file path."""
302 fn = self.filepath(fn)
303 save_data(fn, data)
304 return fn
306 def make_oo_doc(self):
307 """Make an openoffice document, using the html2odt script."""
308 self.wait_for_xvfb()
309 html_text = etree.tostring(self.tree, method="html")
310 save_data(self.body_html_file, html_text)
311 run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
312 log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
313 os.rename(self.body_odt_file, self.publish_file)
314 self.notify_watcher()
316 def extract_pdf_outline(self):
317 """Get the outline (table of contents) for the PDF, which
318 wkhtmltopdf should have written to a file. If that file
319 doesn't exist (or config says not to use it), fall back to
320 using self._extract_pdf_outline_the_old_way, below.
322 if config.USE_DUMP_OUTLINE:
323 try:
324 self.outline_contents, number_of_pages = \
325 parse_extracted_outline(self.outline_file)
327 except Exception, e:
328 traceback.print_exc()
329 number_of_pages = self._extract_pdf_outline_the_old_way()
330 else:
331 number_of_pages = self._extract_pdf_outline_the_old_way()
333 self.notify_watcher()
334 return number_of_pages
    def _extract_pdf_outline_the_old_way(self):
        """Try to get the PDF outline using pdftk. This doesn't work
        well with all scripts.

        Returns the number of pages found; sets self.outline_contents
        to a list of (title, depth, pageno) tuples.
        """
        debugf = self.filepath('extracted-outline.txt')
        self.outline_contents, number_of_pages = \
                parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            #replace each heading's content with an ascii key, remembering
            #the real title so it can be restored after parsing
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        #h1 text goes inside a strong.initial, matching body style
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii-extracted-outline.txt')
            ascii_contents, number_of_ascii_pages = \
                    parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log ("number of pages: %s, post ascii: %s" %
                 (number_of_pages, number_of_ascii_pages))
            #map the ascii keys back to the original titles
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))

        return number_of_pages
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')

        #3. get the outline/page count for the table of contents
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()
    def make_preamble_pdf(self):
        """Make the front-matter PDF: inside cover page plus table of
        contents, reshaped and numbered in the preamble style."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log_types(self.dir, self.css_url, self.title, inside_cover_html,
                  self.toc_header, contents, self.title)

        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     self.toc_header, contents, self.title)
        save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        #number_start=-2 -- presumably offsets numbering past the cover
        #pages; TODO confirm against pdf.PageSettings.number_pdf
        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()
    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover. If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        #log(end_matter)
        #NOTE(review): assumes the end-matter template is utf-8 encoded
        #bytes -- confirm against the template files
        save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()
    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        #join front matter, body, end matter (and isbn page, if any;
        #concat_pdfs presumably skips the None isbn file -- confirm)
        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page. This is used for multicolumn newspapers and for
        web-destined pdfs.

        mode -- 'web' skips gutters, reshaping and page numbers.
        """
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()
504 def rotate180(self):
505 """Rotate the pdf 180 degrees so an RTL book can print on LTR
506 presses."""
507 rotated = self.filepath('final-rotate.pdf')
508 unrotated = self.filepath('final-pre-rotate.pdf')
509 #leave the unrotated pdf intact at first, in case of error.
510 rotate_pdf(self.pdf_file, rotated)
511 os.rename(self.pdf_file, unrotated)
512 os.rename(rotated, self.pdf_file)
513 self.notify_watcher()
515 def publish_pdf(self):
516 """Move the finished PDF to its final resting place"""
517 log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
518 os.rename(self.pdf_file, self.publish_file)
519 self.notify_watcher()
    def publish_bookizip(self):
        """Publish the bookizip. For this, copy rather than move,
        because the bookizip might be used by further processing. If
        possible, a hard link is created."""
        log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
        try:
            #hard link first: cheap, and the published copy is identical
            #NOTE(review): assumes run() raises OSError on failure --
            #confirm against book_utils.run
            run(['cp', '-l', self.bookizip_file, self.publish_file])
        except OSError:
            #hard link failed (e.g. cross-device): fall back to a real copy
            run(['cp', self.bookizip_file, self.publish_file])
        self.notify_watcher()
532 def concat_html(self):
533 """Join all the chapters together into one tree. Keep the TOC
534 up-to-date along the way."""
536 #each manifest item looks like:
537 #{'contributors': []
538 #'license': [],
539 #'mimetype': '',
540 #'rightsholders': []
541 #'url': ''}
542 doc = lxml.html.document_fromstring('<html><body></body></html>')
543 tocmap = filename_toc_map(self.toc)
544 for ID in self.spine:
545 details = self.manifest[ID]
546 #log(ID, pformat(details))
547 # ACO MIJENJAO
548 try:
549 root = self.get_tree_by_id(ID).getroot()
550 except Exception, e:
551 log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
552 continue
553 #handle any TOC points in this file
554 for point in tocmap[details['url']]:
555 #if the url has a #identifier, use it. Otherwise, make
556 #one up, using a hidden element at the beginning of
557 #the inserted document.
558 #XXX this will break if different files use the same ids
559 #XXX should either replace all, or replace selectively.
560 if point['fragment']:
561 fragment = point['fragment']
562 else:
563 body = _find_tag(root, 'body')
564 fragment = '%s_%s' % (self.cookie, point['index'])
565 #reuse first tag if it is suitable.
566 if (len(body) and
567 body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
568 if body[0].get('id') is None:
569 body[0].set('id', fragment)
570 else:
571 fragment = body[0].get('id')
572 #the chapter starts with a heading. that heading should be the chapter name.
573 if body[0].tag in ('h1', 'h2', 'h3'):
574 log('chapter has title "%s", found html title "%s"' %
575 (point['title'], body[0].text_content()))
576 point['html_title'] = body[0].text_content()
577 else:
578 marker = body.makeelement('div', style="display:none",
579 id=fragment)
580 body.insert(0, marker)
581 point['html_id'] = fragment
583 add_guts(root, doc)
584 return doc
586 def unpack_static(self):
587 """Extract static files from the zip for the html to refer to."""
588 static_files = [x['url'] for x in self.manifest.values()
589 if x['url'].startswith('static')]
590 if static_files:
591 os.mkdir(self.filepath('static'))
593 for name in static_files:
594 s = self.store.read(name)
595 f = open(self.filepath(name), 'w')
596 f.write(s)
597 f.close()
598 self.notify_watcher()
    def load_book(self):
        """Assemble the book into a single tree (self.tree) and note
        its h1 headings."""
        #XXX concatenate the HTML to match how TWiki version worked.
        # This is perhaps foolishly early -- throwing away useful boundaries.
        self.unpack_static()
        self.tree = self.concat_html()
        self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            #NOTE(review): this sets a python attribute on the element
            #object, not an html attribute -- presumably read back
            #later in processing; confirm
            h1.title = h1.text_content().strip()
        self.notify_watcher()
    def make_contents(self):
        """Generate HTML containing the table of contents. This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline."""
        header = '<table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        #log(self.outline_contents)
        outline_contents = iter(self.outline_contents)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    #skip outline entries until the next depth-1 heading,
                    #which carries this chapter's page number
                    level = 99
                    while level > 1:
                        h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        if isinstance(doc, unicode):
            doc = doc.encode('utf-8')
        self.notify_watcher()
        return doc
656 def add_section_titles(self):
657 """Add any section heading pages that the TOC.txt file
658 specifies. These are sub-book, super-chapter groupings.
660 Also add initial numbers to chapters.
662 chapter = 1
663 section = None
664 #log(self.toc)
665 for t in self.toc:
666 #only top level sections get a subsection page,
667 #and only if they have children.
668 if t.get('children'):
669 section = self.tree.makeelement('div', Class="objavi-subsection")
670 heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
671 heading.text = t['title']
672 for child in t['children']:
673 item = etree.SubElement(section, 'div', Class="objavi-chapter")
674 if 'html_title' in child:
675 item.text = child['html_title']
676 heading = self.tree.cssselect('#'+ child['html_id'])
677 if heading:
678 _add_initial_number(heading[0], chapter)
679 else:
680 item.text = child['title']
681 _add_initial_number(item, chapter)
682 log(item.text, debug='HTMLGEN')
683 chapter += 1
684 log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
685 location = self.tree.cssselect('#'+ t['html_id'])[0]
686 location.addprevious(section)
689 self.notify_watcher()
692 def add_css(self, css=None, mode='book'):
693 """If css looks like a url, use it as a stylesheet link.
694 Otherwise it is the CSS itself, which is saved to a temporary file
695 and linked to."""
696 log("css is %r" % css)
697 htmltree = self.tree
698 if css is None or not css.strip():
699 css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
700 if css_default is None:
701 #guess from language -- this should come first
702 css_modes = config.LANGUAGE_CSS.get(self.lang,
703 config.LANGUAGE_CSS['en'])
704 css_default = css_modes.get(mode, css_modes[None])
705 url = 'file://' + os.path.abspath(url2path(css_default))
706 elif not re.match(r'^http://\S+$', css):
707 fn = self.save_tempfile('objavi.css', css)
708 url = 'file://' + fn
709 else:
710 url = css
712 #find the head -- it's probably first child but lets not assume.
713 for child in htmltree:
714 if child.tag == 'head':
715 head = child
716 break
717 else:
718 head = htmltree.makeelement('head')
719 htmltree.insert(0, head)
721 link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
722 self.css_url = url
723 self.notify_watcher()
724 return url
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the appropriate language, otherwise in english.

        template -- a filename pattern with one %s slot for the language code.
        NOTE(review): if no candidate file opens, the read below raises
        NameError (f never bound) -- confirm this is acceptable here.
        """
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template
741 def compose_inside_cover(self):
742 """create the markup for the preamble inside cover."""
743 template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
745 if self.isbn:
746 isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
747 else:
748 isbn_text = ''
750 return template % {'date': time.strftime('%Y-%m-%d'),
751 'isbn': isbn_text,
752 'license': self.license,
756 def compose_end_matter(self):
757 """create the markup for the end_matter inside cover. If
758 self.isbn is not set, the html will result in a pdf that
759 spills onto two pages.
761 template = self._read_localised_template(config.END_MATTER_TEMPLATE)
763 d = {'css_url': self.css_url,
764 'title': self.title
767 if self.isbn:
768 d['inside_cover_style'] = ''
769 else:
770 d['inside_cover_style'] = 'page-break-after: always'
772 return template % d
775 def make_epub(self, use_cache=False):
776 """Make an epub version of the book, using Mike McCabe's
777 epub module for the Internet Archive."""
778 ebook = ia_epub.Book(self.publish_file, content_dir='')
779 def add_file(ID, filename, mediatype, content):
780 ebook.add_content({'media-type': mediatype.encode('utf-8'),
781 'id': ID.encode('utf-8'),
782 'href': filename.encode('utf-8'),
783 }, content)
785 toc = self.info['TOC']
787 #manifest
788 filemap = {} #map html to corresponding xhtml
789 spinemap = {} #map IDs to multi-file chapters
790 for ID in self.manifest:
791 details = self.manifest[ID]
792 #log(ID, pformat(details))
793 fn, mediatype = details['url'], details['mimetype']
794 content = self.store.read(fn)
795 if mediatype == 'text/html':
796 #convert to application/xhtml+xml, and perhaps split
797 c = EpubChapter(self.server, self.book, ID, content,
798 use_cache=use_cache)
799 c.remove_bad_tags()
800 if fn[-5:] == '.html':
801 fnbase = fn[:-5]
802 else:
803 fnbase = fn
804 fnx = fnbase + '.xhtml'
805 mediatype = 'application/xhtml+xml'
807 fragments = split_html(c.as_xhtml(),
808 compressed_size=self.store.getinfo(fn).compress_size)
810 #add the first one as if it is the whole thing (as it often is)
811 add_file(ID, fnx, mediatype, fragments[0])
812 filemap[fn] = fnx
813 if len(fragments) > 1:
814 spine_ids = [ID]
815 spinemap[ID] = spine_ids
816 #add any extras
817 for i in range(1, len(fragments)):
818 # XXX it is possible for duplicates if another
819 # file happens to have this name. Ignore for now
820 _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
821 spine_ids.append(_id)
822 add_file(_id,
823 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
824 mediatype, fragments[i])
826 else:
827 add_file(ID, fn, mediatype, content)
829 #toc
830 ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
831 ebook.add(ebook.content_dir + 'toc.ncx', ncx)
833 #spine
834 for ID in self.spine:
835 if ID in spinemap:
836 for x in spinemap[ID]:
837 ebook.add_spine_item({'idref': x})
838 else:
839 ebook.add_spine_item({'idref': ID})
841 #metadata -- no use of attributes (yet)
842 # and fm: metadata disappears for now
843 DCNS = config.DCNS
844 DC = config.DC
845 meta_info_items = []
846 for ns, namespace in self.metadata.items():
847 for keyword, schemes in namespace.items():
848 if ns:
849 keyword = '{%s}%s' % (ns, keyword)
850 for scheme, values in schemes.items():
851 for value in values:
852 item = {
853 'item': keyword,
854 'text': value,
856 if scheme:
857 if keyword in (DCNS + 'creator', DCNS + 'contributor'):
858 item['atts'] = {'role': scheme}
859 else:
860 item['atts'] = {'scheme': scheme}
862 has_authors = 'creator' in self.metadata[DC]
863 if not has_authors and config.CLAIM_UNAUTHORED:
864 authors = []
865 for x in self.metadata[DC]['creator'].values():
866 authors.extend(x)
868 meta_info_items.append({'item': DCNS + 'creator',
869 'text': 'The Contributors'})
871 meta_info_items.append({'item': DCNS + 'rights',
872 'text': 'This book is free. Copyright %s' % (', '.join(authors))}
875 tree_str = ia_epub.make_opf(meta_info_items,
876 ebook.manifest_items,
877 ebook.spine_items,
878 ebook.guide_items,
879 ebook.cover_id)
880 ebook.add(ebook.content_dir + 'content.opf', tree_str)
881 ebook.z.close()
882 self.notify_watcher()
885 def publish_s3(self):
886 """Push the book's epub to archive.org, using S3."""
887 #XXX why only epub?
888 secrets = {}
889 for x in ('S3_SECRET', 'S3_ACCESSKEY'):
890 fn = getattr(config, x)
891 f = open(fn)
892 secrets[x] = f.read().strip()
893 f.close()
895 now = time.strftime('%F')
896 s3output = self.filepath('s3-output.txt')
897 s3url, detailsurl = find_archive_urls(self.book, self.bookname)
898 headers = [
899 'x-amz-auto-make-bucket:1',
900 "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
901 'x-archive-meta-mediatype:texts',
902 'x-archive-meta-collection:opensource',
903 'x-archive-meta-title:%s' % (self.book,),
904 'x-archive-meta-date:%s' % (now,),
905 'x-archive-meta-creator:FLOSS Manuals Contributors',
908 if self.license in config.LICENSES:
909 headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
911 argv = ['curl', '--location', '-s', '-o', s3output]
912 for h in headers:
913 argv.extend(('--header', h))
914 argv.extend(('--upload-file', self.publish_file, s3url,))
916 log(' '.join(repr(x) for x in argv))
917 check_call(argv, stdout=sys.stderr)
918 self.notify_watcher()
919 return detailsurl, s3url
def spawn_x(self):
    """Start an Xvfb instance, using a new server number.  A
    reference to it is stored in self.xvfb, which is used to kill
    it when the pdf is done.

    Note that Xvfb doesn't interact well with dbus which is
    present on modern desktops.
    """
    #Find an unused server number (in case two cgis are running at once)
    #X puts a lock file in /tmp for each display in use.
    while True:
        servernum = random.randrange(50, 500)
        if not os.path.exists('/tmp/.X%s-lock' % servernum):
            break

    self.xserver_no = ':%s' % servernum

    #Keep the X authority data in this book's private workdir.
    authfile = self.filepath('Xauthority')
    os.environ['XAUTHORITY'] = authfile

    #mcookie(1) eats into /dev/random, so avoid that
    from hashlib import md5
    m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
    mcookie = m.hexdigest()

    check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

    self.xvfb = Popen(['Xvfb', self.xserver_no,
                       '-screen', '0', '1024x768x24',
                       '-pixdepths', '32',
                       #'-blackpixel', '0',
                       #'-whitepixel', str(2 ** 24 -1),
                       #'+extension', 'Composite',
                       '-dpi', '96',
                       #'-kb',
                       '-nolisten', 'tcp',
                       ])

    # We need to wait a bit before the Xvfb is ready.  but the
    # downloads are so slow that that probably doesn't matter
    #wait_for_xvfb() sleeps until this deadline before using the display.
    self.xvfb_ready_time = time.time() + 2

    os.environ['DISPLAY'] = self.xserver_no
    log(self.xserver_no)
def wait_for_xvfb(self):
    """Block until the deadline recorded by spawn_x has passed, so
    the Xvfb server has had time to properly start.  A no-op when no
    Xvfb was ever spawned."""
    if not hasattr(self, 'xvfb'):
        return
    remaining = self.xvfb_ready_time - time.time()
    if remaining > 0:
        time.sleep(remaining)
    self.notify_watcher()
def cleanup_x(self):
    """Try very hard to kill off Xvfb.  In addition to killing
    this instance's xvfb, occasionally (randomly) search for
    escaped Xvfb instances and kill those too."""
    if not hasattr(self, 'xvfb'):
        #spawn_x was never called, so there is nothing to clean up.
        return
    check_call(['xauth', 'remove', self.xserver_no])
    p = self.xvfb
    log("trying to kill Xvfb %s" % p.pid)
    #SIGTERM first, giving it a chance to exit cleanly.
    os.kill(p.pid, 15)
    for i in range(10):
        if p.poll() is not None:
            log("%s died with %s" % (p.pid, p.poll()))
            break
        log("%s not dead yet" % p.pid)
        time.sleep(0.2)
    else:
        #the loop ran out without breaking: ~2 seconds of SIGTERM
        #politeness is up, so resort to SIGKILL.
        log("Xvfb would not die! kill -9! kill -9!")
        try:
            os.kill(p.pid, 9)
        except OSError, e:
            #probably it died between the last poll and this kill.
            log(e)

    if random.random() < 0.1:
        # occasionally kill old xvfbs and soffices, if there are any.
        self.kill_old_processes()
1003 def kill_old_processes(self):
1004 """Sometimes, despite everything, Xvfb or soffice instances
1005 hang around well after they are wanted -- for example if the
1006 cgi process dies particularly badly. So kill them if they have
1007 been running for a long time."""
1008 log("running kill_old_processes")
1009 killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1010 os.path.basename(config.HTML2ODT),
1011 os.path.basename(config.WKHTMLTOPDF),
1013 p = Popen(['ps', '-C', killable_names,
1014 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1015 data = p.communicate()[0].strip()
1016 if data:
1017 lines = data.split('\n')
1018 pids = []
1019 for line in lines:
1020 log('dealing with ps output "%s"' % line)
1021 try:
1022 pid, days, hours, minutes, seconds \
1023 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1024 except AttributeError:
1025 log("Couldn't parse that line!")
1026 # 50 minutes should be enough xvfb time for anyone
1027 if days or hours or int(minutes) > 50:
1028 pid = int(pid)
1029 log("going to kill pid %s" % pid)
1030 os.kill(pid, 15)
1031 pids.append(pid)
1033 time.sleep(1.0)
1034 for pid in pids:
1035 #try again in case any are lingerers
1036 try:
1037 os.kill(int(pid), 9)
1038 except OSError, e:
1039 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1040 continue
1041 log('killing %s with -9' % pid)
1042 self.notify_watcher()
def cleanup(self):
    """Kill the X server and, unless configured to keep temporary
    files, empty and remove this book's working directory."""
    self.cleanup_x()
    if config.KEEP_TEMP_FILES:
        log("NOT removing '%s', containing the following files:" % self.workdir)
        log(*os.listdir(self.workdir))
    else:
        #the workdir is flat, so removing each file then the
        #directory itself is sufficient.
        for name in os.listdir(self.workdir):
            os.remove(os.path.join(self.workdir, name))
        os.rmdir(self.workdir)

    self.notify_watcher()
def use_cache():
    """Return True when this host is configured to always use
    cached booki-zips."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
1060 def _read_cached_zip(server, book, max_age):
1061 #find a recent zip if possible
1062 prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1063 from glob import glob
1064 zips = sorted(glob(prefix + '*.zip'))
1065 if not zips:
1066 log("no cached booki-zips matching %s*.zip" % (prefix,))
1067 return None
1068 zipname = zips[-1]
1069 cutoff = time.time() - max_age * 60
1070 log(repr(zipname))
1071 try:
1072 date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1073 if date > cutoff:
1074 f = open(zipname)
1075 blob = f.read()
1076 f.close()
1077 return blob, zipname
1078 log("%s is too old, must reload" % zipname)
1079 return None
1080 except (IOError, IndexError, ValueError), e:
1081 log('could not make sense of %s: got exception %s' % (zipname, e))
1082 return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for a book, preferring a fresh-enough
    local cache copy when caching applies.

    Returns (blob, filename): blob is the raw zip data; filename is
    the cache file it came from or was saved to (None when not
    saving and no filename was given).

    max_age is in minutes; 0 disables the cache, negative means
    "use the default policy".  Raises NotImplementedError for an
    unknown server interface."""
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        #NOTE(review): HTTP_HOST is assumed to be a module-level
        #global defined elsewhere in this file -- confirm.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book': book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s' % url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        #BUG FIX: write in binary mode -- the blob is zip data, and
        #text mode would corrupt it on platforms that translate
        #line endings.
        f = open(filename, 'wb')
        f.write(blob)
        f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    html is a byte string.  compressed_size, when given, saves
    recompressing the html to estimate its deflated size.  When
    fix_markup is true, the html is first round-tripped through
    lxml, which makes the naive '<' search below more reliable.

    Returns a list of html strings (one element when no split is
    needed)."""
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    #number of extra pieces needed, by whichever limit bites harder.
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    #insert a marker <hr> at the first tag boundary at or after each
    #evenly spaced target offset; split_tree presumably divides the
    #document at these markers -- confirm against xhtml_utils.
    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]