# Part of Objavi2, which turns html manuals into books.
# This contains classes representing books and coordinates their processing.
#
# Copyright (C) 2009 Douglas Bagnall
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

20 """Library module representing a complete FM book being turned into a
21 PDF"""
import os, sys
import tempfile
import re, time
import random
from subprocess import Popen, check_call, PIPE
from cStringIO import StringIO
from urllib2 import urlopen, HTTPError
import zipfile
import traceback
from string import ascii_letters
from pprint import pformat

try:
    import simplejson as json
except ImportError:
    import json

import lxml.html
from lxml import etree

from objavi import config, epub_utils
from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
from objavi.book_utils import ObjaviError
from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
from objavi.epub import add_guts, _find_tag
from objavi.xhtml_utils import EpubChapter, split_tree

from iarchive import epub as ia_epub
from booki.bookizip import get_metadata, add_metadata

TMPDIR = os.path.abspath(config.TMPDIR)
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
HTTP_HOST = os.environ.get('HTTP_HOST', '')
PUBLISH_PATH = "%s/books/" % DOC_ROOT

def find_archive_urls(bookid, bookname):
    s3url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
    detailsurl = 'http://archive.org/details/booki-%s' % (bookid,)
    return (s3url, detailsurl)

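
#For example (with hypothetical arguments):
#  find_archive_urls('44', 'mybook-en-2009.01.01-01.01.01.pdf')
#  == ('http://s3.us.archive.org/booki-44/mybook-en-2009.01.01-01.01.01.pdf',
#      'http://archive.org/details/booki-44')
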
def _get_best_title(tocpoint):
    if 'html_title' in tocpoint:
        return tocpoint['html_title']
    if 'title' in tocpoint:
        return tocpoint['title']
    return 'Untitled'

def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e."""
    initial = e.makeelement("strong", Class="initial")
    e.insert(0, initial)
    initial.tail = ' '
    if e.text is not None:
        initial.tail += e.text
    e.text = ''
    initial.text = "%s." % n

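
#For example (a sketch): given e = <h1>Dogs</h1>, _add_initial_number(e, 3)
#leaves <h1><strong class="initial">3.</strong> Dogs</h1>.
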
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience"""
    for item in toc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        item['depth'] = depth
        item["filename"] = filename
        item["fragment"] = fragment
        item["index"] = index
        index += 1
        if 'children' in item:
            index = expand_toc(item['children'], depth + 1, index)
    return index

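
#For example (hypothetical input): after expand_toc(toc), a point that
#arrived as {'url': '/ch1.html#s2', ...} also carries
#{'depth': 1, 'filename': 'ch1.html', 'fragment': 's2', 'index': 0},
#with 'index' counting points depth-first in document order.
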
def _serialise(rtoc, stoc, depth):
    for item in rtoc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        stoc.append({"depth": depth,
                     "title": item['title'],
                     "url": url,
                     "filename": filename,
                     "fragment": fragment,
                     "type": item['type']
                     })
        if 'children' in item:
            _serialise(item['children'], stoc, depth + 1)

def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points.  Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for i, x in enumerate(stoc):
        x['position'] = i
    return stoc

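
#For example, the nested TOC is flattened so that each point also gets a
#'position' key: the first point comes out with {'depth': 1, 'position': 0, ...}.
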
def filename_toc_map(rtoc):
    tocmap = {}
    log(rtoc)
    def traverse(toc):
        for point in toc:
            log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            if 'children' in point:
                traverse(point['children'])
    traverse(rtoc)
    return tocmap

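
#For example (assuming expand_toc() has already added the 'filename'
#keys): two points whose urls both name 'ch1.html' end up together in
#tocmap['ch1.html'].
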
class Book(object):
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'

    def notify_watcher(self, message=None):
        if self.watchers:
            if message is None:
                #message is the name of the caller
                message = traceback.extract_stack(None, 2)[0][2]
            log("notify_watcher called with '%s'" % message)
            for w in self.watchers:
                w(message)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.notify_watcher(config.FINISHED_MESSAGE)
        self.cleanup()
        #could deal with exceptions here and return true

    def __init__(self, book, server, bookname,
                 page_settings=None, watchers=None, isbn=None,
                 license=config.DEFAULT_LICENSE, title=None,
                 max_age=0):
        log("*** Starting new book %s ***" % bookname)
        self.watchers = set()
        if watchers is not None:
            self.watchers.update(watchers)
        self.notify_watcher('start')
        self.bookname = bookname
        self.book = book
        self.server = server
        self.cookie = ''.join(random.sample(ascii_letters, 10))
        try:
            blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
        except HTTPError, e:
            traceback.print_exc()
            self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
            #not much to do?
            #raise 502 Bad Gateway ?
            sys.exit()
        f = StringIO(blob)
        self.notify_watcher('fetch_zip')
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        for k in ('manifest', 'metadata', 'spine', 'TOC'):
            if k not in self.info:
                raise ObjaviError('info.json of %s lacks vital element "%s"' %
                                  (bookname, k))
            #check types also?

        self.metadata = self.info['metadata']
        self.spine = self.info['spine']
        self.manifest = self.info['manifest']

        if server == config.LOCALHOST: # [DEPRECATED]
            server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
            book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]

        log(pformat(self.metadata))
        self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
        if not self.lang:
            self.lang = guess_lang(server, book)
            log('guessed lang as %s' % self.lang)

        self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
        if not self.toc_header:
            self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

        self.dir = get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0]
        if not self.dir:
            self.dir = guess_text_dir(server, book)

        #Patch in the extra metadata. (lang and dir may be set from config)
        #these should be read from zip -- so should go into zip?
        for var, key, scheme, ns in (
            (isbn, 'id', 'ISBN', config.DC),
            (license, 'rights', 'License', config.DC),
            (title, 'title', '', config.DC),
            (self.lang, 'language', '', config.DC),
            (self.dir, 'dir', '', config.FM),
            ):
            if var is not None:
                add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

        self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
        self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]

        self.toc = self.info['TOC']
        expand_toc(self.toc)

        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        os.chmod(self.workdir, 0755)

        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')

        self.publish_file = os.path.join(PUBLISH_PATH, bookname)
        self.publish_url = os.path.join(config.PUBLISH_URL, bookname)

        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        if title is not None:
            self.title = title
        else:
            titles = get_metadata(self.metadata, 'title')
            if titles:
                self.title = titles[0]
            else:
                self.title = 'A Book About ' + self.book

        self.notify_watcher()

    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Don't even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()

    def get_tree_by_id(self, id):
        """get an HTML tree from the given manifest ID"""
        name = self.manifest[id]['url']
        mimetype = self.manifest[id]['mimetype']
        s = self.store.read(name)
        f = StringIO(s)
        if mimetype == 'text/html':
            try:
                tree = lxml.html.parse(f)
            except etree.XMLSyntaxError, e:
                log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                    (id, name, s[:20], e))
                tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
        elif 'xml' in mimetype: #XXX or is this just asking for trouble?
            tree = etree.parse(f)
        else:
            tree = f.read()
        f.close()
        return tree

    def filepath(self, fn):
        return os.path.join(self.workdir, fn)

    def save_data(self, fn, data):
        """Save without tripping up on unicode"""
        if isinstance(data, unicode):
            data = data.encode('utf8', 'ignore')
        f = open(fn, 'w')
        f.write(data)
        f.close()

    def save_tempfile(self, fn, data):
        """Save the data in a temporary directory that will be cleaned
        up when all is done.  Return the absolute file path."""
        fn = self.filepath(fn)
        self.save_data(fn, data)
        return fn

    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script."""
        self.wait_for_xvfb()
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()

    def extract_pdf_outline(self):
        #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
        debugf = self.filepath('outline.txt')
        self.outline_contents, self.outline_text, number_of_pages = \
            parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" % (key, e.text, titlemap[key]))

            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            self.save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii_outline.txt')
            ascii_contents, ascii_text, number_of_ascii_pages = \
                parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log("number of pages: %s, post ascii: %s" %
                (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))
        else:
            for x in self.outline_contents:
                log(x)

        self.notify_watcher()
        return number_of_pages

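
    # (parse_outline() returns the outline entries as (title, depth, pageno)
    # tuples -- e.g. ('Introduction', 1, 9) -- along with the raw outline
    # text and the page count; the tuples later drive make_contents().)
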
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
        self.notify_watcher('generate_pdf')

        #3. extract the outline, for the table of contents
        n_pages = self.extract_pdf_outline()

        log("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5. add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()

    def make_preamble_pdf(self):
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log(self.dir, self.css_url, self.title, inside_cover_html,
            self.toc_header, contents, self.title)

        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html.decode('utf-8'),
                     self.toc_header, contents, self.title)
        self.save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()

    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover.  If there is an
        ISBN number, its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        log(end_matter)
        self.save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()

    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')

    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page.  This is used for multicolumn newspapers and for
        web-destined pdfs."""
        self.wait_for_xvfb()
        #0. Add heading to beginning of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to the final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()

    def rotate180(self):
        """Rotate the pdf 180 degrees so an RTL book can print on LTR
        presses."""
        rotated = self.filepath('final-rotate.pdf')
        unrotated = self.filepath('final-pre-rotate.pdf')
        #leave the unrotated pdf intact at first, in case of error.
        rotate_pdf(self.pdf_file, rotated)
        os.rename(self.pdf_file, unrotated)
        os.rename(rotated, self.pdf_file)
        self.notify_watcher()

    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()

    def publish_bookizip(self):
        """Publish the bookizip.  For this, copy rather than move,
        because the bookizip might be used by further processing.  If
        possible, a hard link is created."""
        log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
        try:
            run(['cp', '-l', self.bookizip_file, self.publish_file])
        except OSError:
            run(['cp', self.bookizip_file, self.publish_file])
        self.notify_watcher()

    def concat_html(self):
        """Join all the chapters together into one tree.  Keep the TOC
        up-to-date along the way."""
        #each manifest item looks like:
        #{'contributors': [],
        # 'license': [],
        # 'mimetype': '',
        # 'rightsholders': [],
        # 'url': ''}
        doc = lxml.html.document_fromstring('<html><body></body></html>')
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            log(ID, pformat(details))
            # ACO changed this: skip chapters whose HTML cannot be loaded
            try:
                root = self.get_tree_by_id(ID).getroot()
            except:
                continue
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            log('chapter has title "%s", found html title "%s"' %
                                (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                    point['html_id'] = fragment

            add_guts(root, doc)
        return doc

    def unpack_static(self):
        """Extract static files from the zip for the html to refer to."""
        static_files = [x['url'] for x in self.manifest.values()
                        if x['url'].startswith('static')]
        if static_files:
            os.mkdir(self.filepath('static'))

        for name in static_files:
            s = self.store.read(name)
            f = open(self.filepath(name), 'w')
            f.write(s)
            f.close()
        self.notify_watcher()

    def load_book(self):
        """Assemble the book's HTML into a single tree, and note its
        chapter headings."""
        #XXX concatenate the HTML to match how TWiki version worked.
        # This is perhaps foolishly early -- throwing away useful boundaries.
        self.unpack_static()
        self.tree = self.concat_html()
        self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            h1.title = h1.text_content().strip()
        self.notify_watcher()

    def make_contents(self):
        """Generate HTML containing the table of contents.  This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline."""
        header = '<h1>Table of Contents</h1><table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        subsections = [] # for the subsection heading pages.

        outline_contents = iter(self.outline_contents)
        headings = iter(self.headings)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        self.notify_watcher()
        return doc

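
    # A generated row looks like, for example:
    #   <tr><td class="chapter">1</td><td class="title">Introduction</td>
    #   <td class="pagenumber">9</td></tr>
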
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters."""
        headings = iter(self.headings)
        chapter = 1
        section = None
        log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#' + child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                log("#%s is %s" % (t['html_id'], self.tree.cssselect('#' + t['html_id'])))
                location = self.tree.cssselect('#' + t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()

    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to."""
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
            if css_default is None:
                #guess from language -- this should come first
                css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                    config.LANGUAGE_CSS['en'])
                css_default = css_modes.get(mode, css_modes[None])
            url = 'file://' + os.path.abspath(css_default)
        elif not re.match(r'^http://\S+$', css):
            fn = self.save_tempfile('objavi.css', css)
            url = 'file://' + fn
        else:
            url = css
        #XXX for debugging and perhaps sensible anyway
        #url = url.replace('file:///home/douglas/objavi2', '')

        #find the head -- it's probably the first child but let's not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url

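
    # Typical usage (a sketch -- raw CSS is saved to a temp file and linked,
    # while an http:// URL is linked to directly):
    #   book.add_css('h1 { page-break-before: always; }', mode='book')
    #   book.add_css('http://example.com/objavi.css')
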
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the appropriate language, otherwise in English."""
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open template for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template

    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           }

    def compose_end_matter(self):
        """create the markup for the end_matter inside cover.  If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages."""
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title,
             }

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d

    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive."""
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        def add_file(ID, filename, mediatype, content):
            ebook.add_content({'media-type': mediatype.encode('utf-8'),
                               'id': ID.encode('utf-8'),
                               'href': filename.encode('utf-8'),
                               }, content)

        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        spinemap = {} #map IDs to multi-file chapters
        for ID in self.manifest:
            details = self.manifest[ID]
            log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            content = self.store.read(fn)
            if mediatype == 'text/html':
                #convert to application/xhtml+xml, and perhaps split
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                if fn[-5:] == '.html':
                    fnbase = fn[:-5]
                else:
                    fnbase = fn
                fnx = fnbase + '.xhtml'
                mediatype = 'application/xhtml+xml'

                fragments = split_html(c.as_xhtml(),
                                       compressed_size=self.store.getinfo(fn).compress_size)

                #add the first one as if it is the whole thing (as it often is)
                add_file(ID, fnx, mediatype, fragments[0])
                filemap[fn] = fnx
                if len(fragments) > 1:
                    spine_ids = [ID]
                    spinemap[ID] = spine_ids
                    #add any extras
                    for i in range(1, len(fragments)):
                        # XXX it is possible for duplicates if another
                        # file happens to have this name. Ignore for now
                        _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                        spine_ids.append(_id)
                        add_file(_id,
                                 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                                 mediatype, fragments[i])

            else:
                add_file(ID, fn, mediatype, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            if ID in spinemap:
                for x in spinemap[ID]:
                    ebook.add_spine_item({'idref': x})
            else:
                ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                        }
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}
                        meta_info_items.append(item)

        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            authors = []
            #use .get() -- when has_authors is false there is no 'creator' key
            for x in self.metadata[DC].get('creator', {}).values():
                authors.extend(x)

            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))})

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()
        self.notify_watcher()

    def publish_s3(self):
        """Push the book's epub to archive.org, using S3."""
        #XXX why only epub?
        secrets = {}
        for x in ('S3_SECRET', 'S3_ACCESSKEY'):
            fn = getattr(config, x)
            f = open(fn)
            secrets[x] = f.read().strip()
            f.close()

        log(secrets)
        now = time.strftime('%F')
        s3output = self.filepath('s3-output.txt')
        s3url, detailsurl = find_archive_urls(self.book, self.bookname)
        headers = [
            'x-amz-auto-make-bucket:1',
            "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
            'x-archive-meta-mediatype:texts',
            'x-archive-meta-collection:opensource',
            'x-archive-meta-title:%s' % (self.book,),
            'x-archive-meta-date:%s' % (now,),
            'x-archive-meta-creator:FLOSS Manuals Contributors',
            ]

        if self.license in config.LICENSES:
            headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

        argv = ['curl', '--location', '-s', '-o', s3output]
        for h in headers:
            argv.extend(('--header', h))
        argv.extend(('--upload-file', self.publish_file, s3url,))

        log(' '.join(repr(x) for x in argv))
        check_call(argv, stdout=sys.stderr)
        self.notify_watcher()
        return detailsurl, s3url

    def spawn_x(self):
        """Start an Xvfb instance, using a new server number.  A
        reference to it is stored in self.xvfb, which is used to kill
        it when the pdf is done.

        Note that Xvfb doesn't interact well with dbus which is
        present on modern desktops.
        """
        #Find an unused server number (in case two cgis are running at once)
        while True:
            servernum = random.randrange(50, 500)
            if not os.path.exists('/tmp/.X%s-lock' % servernum):
                break

        self.xserver_no = ':%s' % servernum

        authfile = self.filepath('Xauthority')
        os.environ['XAUTHORITY'] = authfile

        #mcookie(1) eats into /dev/random, so avoid that
        from hashlib import md5
        m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
        mcookie = m.hexdigest()

        check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

        self.xvfb = Popen(['Xvfb', self.xserver_no,
                           '-screen', '0', '1024x768x24',
                           '-pixdepths', '32',
                           #'-blackpixel', '0',
                           #'-whitepixel', str(2 ** 24 - 1),
                           #'+extension', 'Composite',
                           '-dpi', '96',
                           '-kb',
                           '-nolisten', 'tcp',
                           ])

        # We need to wait a bit before the Xvfb is ready.  but the
        # downloads are so slow that that probably doesn't matter
        self.xvfb_ready_time = time.time() + 2

        os.environ['DISPLAY'] = self.xserver_no
        log(self.xserver_no)

    def wait_for_xvfb(self):
        """wait until a previously set time before continuing.  This
        is so Xvfb has time to properly start."""
        if hasattr(self, 'xvfb'):
            d = self.xvfb_ready_time - time.time()
            if d > 0:
                time.sleep(d)
            self.notify_watcher()

    def cleanup_x(self):
        """Try very hard to kill off Xvfb.  In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        if not hasattr(self, 'xvfb'):
            return
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            log("Xvfb would not die! kill -9! kill -9!")
            os.kill(p.pid, 9)

        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()

    def kill_old_processes(self):
        """Sometimes, despite everything, Xvfb or soffice instances
        hang around well after they are wanted -- for example if the
        cgi process dies particularly badly.  So kill them if they have
        been running for a long time."""
        log("running kill_old_processes")
        p = Popen(['ps', '-C', 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
                   '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
        data = p.communicate()[0].strip()
        if data:
            lines = data.split('\n')
            pids = []
            for line in lines:
                log('dealing with ps output "%s"' % line)
                try:
                    pid, days, hours, minutes, seconds \
                        = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
                except AttributeError:
                    log("Couldn't parse that line!")
                    continue
                # 50 minutes should be enough xvfb time for anyone
                if days or hours or int(minutes) > 50:
                    pid = int(pid)
                    log("going to kill pid %s" % pid)
                    os.kill(pid, 15)
                    pids.append(pid)

            time.sleep(1.0)
            for pid in pids:
                #try again in case any are lingerers
                try:
                    os.kill(int(pid), 9)
                except OSError, e:
                    log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                    continue
                log('killing %s with -9' % pid)
        self.notify_watcher()

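
    # The etime values parsed above look like, for example, '1-02:33:44'
    # (days-hours:minutes:seconds), '02:33:44', or '33:44'; the optional
    # groups in the regex cover all three forms.
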
    def cleanup(self):
        self.cleanup_x()
        if not config.KEEP_TEMP_FILES:
            for fn in os.listdir(self.workdir):
                os.remove(os.path.join(self.workdir, fn))
            os.rmdir(self.workdir)
        else:
            log("NOT removing '%s', containing the following files:" % self.workdir)
            log(*os.listdir(self.workdir))

        self.notify_watcher()

def use_cache():
    return (os.environ.get('HTTP_HOST') in config.USE_ZIP_CACHE_ALWAYS_HOSTS)

def _read_cached_zip(server, book, max_age):
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            f = open(zipname)
            blob = f.read()
            f.close()
            return blob, zipname
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError), e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None

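
#Cached zip names have the form <prefix>-YYYY.MM.DD-HH.MM.SS.zip, where
#<prefix> comes from make_book_name(); the glob above matches on the
#prefix and strptime() then checks the datestamp against max_age.
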
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book': book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s' % url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        f = open(filename, 'w')
        f.write(blob)
        f.close()
    return blob, filename

def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader."""
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g. before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]

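
#For example (a sketch): split_html(html) returns [html] untouched while
#the document is under the configured size limits; otherwise markers like
#<hr class="..." id="split_0" /> are inserted at tag boundaries and
#split_tree() re-parses the result into one tree per fragment.
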