need to add text as unicode, not utf-8
[objavi2.git] / objavi / fmbook.py
blob5f81e059872bcf688ba3a4347dc3f794de2e2d0b
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 import copy
28 from subprocess import Popen, check_call, PIPE
29 from cStringIO import StringIO
30 from urllib2 import urlopen, HTTPError
31 import zipfile
32 import traceback
33 from string import ascii_letters
34 from pprint import pformat
36 try:
37 import json
38 except ImportError:
39 import simplejson as json
41 import lxml.html
42 from lxml import etree
44 from objavi import config, epub_utils
45 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
46 from objavi.book_utils import ObjaviError, log_types
47 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
48 from objavi.epub import add_guts, _find_tag
49 from objavi.xhtml_utils import EpubChapter, split_tree
50 from objavi.cgi_utils import url2path, path2url
52 from iarchive import epub as ia_epub
53 from booki.bookizip import get_metadata, add_metadata
# Working directory for intermediate build products (absolute, so it
# survives any later chdir).
TMPDIR = os.path.abspath(config.TMPDIR)
# Web server document root; falls back to the configured htdocs path
# when not running under a web server.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Host header of the current HTTP request ('' when run from the shell).
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the pair (S3 upload URL, details-page URL) for the
    archive.org item that holds this book."""
    item = 'booki-%s' % bookid
    upload_url = 'http://s3.us.archive.org/%s/%s' % (item, bookname)
    details_url = 'http://archive.org/details/%s' % item
    return (upload_url, details_url)
def _get_best_title(tocpoint):
    """Pick the best available title for a TOC point, preferring the
    title extracted from the chapter HTML over the TOC's own title."""
    for key in ('html_title', 'title'):
        if key in tocpoint:
            return tocpoint[key]
    return 'Untitled'
def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e."""
    # <strong class="initial"> wraps the number so CSS can style it.
    initial = e.makeelement("strong", Class="initial")
    e.insert(0, initial)
    # Shift the element's leading text to after the number marker.
    initial.tail = ' '
    if e.text is not None:
        initial.tail += e.text
    e.text = ''
    initial.text = "%s." % n
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience.

    Each point is annotated in place with its nesting depth, the
    filename and fragment split out of its url, and a serial index.
    Returns the next unused serial index.
    """
    for point in toc:
        url = point['url'].lstrip('/')
        if '#' in url:
            filename, fragment = url.split('#', 1)
        else:
            filename, fragment = url, None
        point['depth'] = depth
        point['filename'] = filename
        point['fragment'] = fragment
        point['index'] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
def _serialise(rtoc, stoc, depth):
    """Flatten the recursive TOC rtoc into the list stoc, recording
    each point's nesting depth and split-out filename/fragment."""
    for point in rtoc:
        url = point['url'].lstrip('/')
        if '#' in url:
            filename, fragment = url.split('#', 1)
        else:
            filename, fragment = url, None
        stoc.append({
            "depth": depth,
            "title": point['title'],
            "url": url,
            "filename": filename,
            "fragment": fragment,
            "type": point['type'],
        })
        if 'children' in point:
            _serialise(point['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points. Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Group the points of the recursive TOC rtoc by the file they
    point into, preserving pre-order: {filename: [points...]}."""
    tocmap = {}
    # Iterative pre-order walk: take the front point, then push its
    # children ahead of its siblings.
    pending = list(rtoc)
    while pending:
        point = pending.pop(0)
        tocmap.setdefault(point['filename'], []).append(point)
        pending[0:0] = point.get('children', [])
    return tocmap
def save_data(fn, data):
    """Save data to the file fn without tripping up on unicode.

    Unicode strings are encoded as UTF-8 (characters that cannot be
    encoded are silently dropped); byte strings are written verbatim.
    """
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    # 'with' guarantees the file is closed even if write() raises,
    # where the previous open()/write()/close() sequence leaked the
    # handle on error.
    with open(fn, 'w') as f:
        f.write(data)
class Book(object):
    """A book being assembled from a bookizip, coordinating its
    conversion into PDF, ODT, EPUB, or templated-HTML output."""
    # Page-numbering styles for the main body and the preamble pages.
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'
    def notify_watcher(self, message=None):
        """Call every registered watcher callback with message.

        If message is None, the name of the calling function is used,
        so a bare self.notify_watcher() reports progress by itself.
        """
        if self.watchers:
            if message is None:
                #message is the name of the caller
                message = traceback.extract_stack(None, 2)[0][2]
            log("notify_watcher called with '%s'" % message)
            for w in self.watchers:
                w(message)
    def __enter__(self):
        """Context-manager entry: the Book itself is the resource."""
        return self
    def __exit__(self, exc_type, exc_value, tb):
        """Context-manager exit: announce completion and clean up the
        working directory. Exceptions are not suppressed (implicitly
        returns None)."""
        self.notify_watcher(config.FINISHED_MESSAGE)
        self.cleanup()
        #could deal with exceptions here and return true
    def __init__(self, book, server, bookname,
                 page_settings=None, watchers=None, isbn=None,
                 license=config.DEFAULT_LICENSE, title=None,
                 max_age=0):
        """Fetch the bookizip for book from server and set up all the
        working state for subsequent conversions.

        book     -- identifier of the book on the server
        server   -- server the bookizip is fetched from
        bookname -- filename used for the published output
        page_settings -- dict of PageSettings keyword arguments
        watchers -- callables notified after each processing step
        isbn, license, title -- metadata overrides patched into the book
        max_age  -- maximum acceptable age (seconds) of a cached zip
        """
        log("*** Starting new book %s ***" % bookname)
        self.watchers = set()
        if watchers is not None:
            self.watchers.update(watchers)
        self.notify_watcher('start')
        self.bookname = bookname
        self.book = book
        self.server = server
        # Random cookie used later to make unique fragment IDs.
        self.cookie = ''.join(random.sample(ascii_letters, 10))
        try:
            blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
        except HTTPError, e:
            traceback.print_exc()
            self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
            #not much to do?
            #raise 502 Bad Gateway ?
            sys.exit()
        f = StringIO(blob)
        self.notify_watcher('fetch_zip')
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        # A usable bookizip must carry all four top-level structures.
        for k in ('manifest', 'metadata', 'spine', 'TOC'):
            if k not in self.info:
                raise ObjaviError('info.json of %s lacks vital element "%s"' %
                                  (bookname, k))
            #check types also?

        self.metadata = self.info['metadata']
        self.spine = self.info['spine']
        self.manifest = self.info['manifest']

        if server == config.LOCALHOST: # [DEPRECATED]
            server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
            book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]

        log(pformat(self.metadata))
        self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
        if not self.lang:
            self.lang = guess_lang(server, book)
            log('guessed lang as %s' % self.lang)

        self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
        if not self.toc_header:
            self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

        # Text direction ('LTR'/'RTL'); guessed from server/book if unset.
        self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
        if not self.dir:
            self.dir = guess_text_dir(server, book)

        #Patch in the extra metadata. (lang and dir may be set from config)
        #these should be read from zip -- so should go into zip?
        for var, key, scheme, ns in (
                (isbn, 'id', 'ISBN', config.DC),
                (license, 'rights', 'License', config.DC),
                (title, 'title', '', config.DC),
                (self.lang, 'language', '', config.DC),
                (self.dir, 'dir', '', config.FM),
                ):
            if var is not None:
                add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

        self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
        self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]

        self.toc = self.info['TOC']
        expand_toc(self.toc)

        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        os.chmod(self.workdir, 0755)

        # Paths of every intermediate and final build product.
        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')
        self.outline_file = self.filepath('outline.txt')

        self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))

        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        if title is not None:
            self.title = title
        else:
            titles = get_metadata(self.metadata, 'title')
            if titles:
                self.title = titles[0]
            else:
                self.title = 'A Book About ' + self.book
        # Internally the title is kept as a UTF-8 byte string.
        if isinstance(self.title, unicode):
            self.title = self.title.encode('utf-8')

        self.notify_watcher()
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            """Best-effort removal of the workdir when the Book is
            garbage-collected (only defined when configured on)."""
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
    def get_tree_by_id(self, id):
        """get an HTML tree from the given manifest ID.

        Returns an lxml tree for HTML/XML content; for any other
        mimetype the raw bytes are returned instead.
        """
        name = self.manifest[id]['url']
        mimetype = self.manifest[id]['mimetype']
        s = self.store.read(name)
        f = StringIO(s)
        if mimetype == 'text/html':
            try:
                tree = lxml.html.parse(f)
            except etree.XMLSyntaxError, e:
                log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                    (id, name, s[:20], e))
                # Fall back to an empty document rather than aborting
                # the whole build on one broken chapter.
                tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
        elif 'xml' in mimetype: #XXX or is this just asking for trouble?
            tree = etree.parse(f)
        else:
            # Not parseable as a tree: hand back the raw content.
            tree = f.read()
        f.close()
        return tree
    def filepath(self, fn):
        """Return the absolute path of fn inside the book's workdir."""
        return os.path.join(self.workdir, fn)
    def save_tempfile(self, fn, data):
        """Save the data in a temporary directory that will be cleaned
        up when all is done. Return the absolute file path."""
        fn = self.filepath(fn)
        save_data(fn, data)
        return fn
    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script."""
        # html2odt needs a display; wait for the Xvfb server first.
        self.wait_for_xvfb()
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()
    def extract_pdf_outline(self):
        """Get the outline (table of contents) for the PDF, which
        wkhtmltopdf should have written to a file. If that file
        doesn't exist (or config says not to use it), fall back to
        using self._extract_pdf_outline_the_old_way, below.

        Returns the number of pages in the body PDF.
        """
        if config.USE_DUMP_OUTLINE:
            try:
                self.outline_contents, number_of_pages = \
                    parse_extracted_outline(self.outline_file)

            except Exception, e:
                # Any failure reading the dump falls back to pdftk.
                traceback.print_exc()
                number_of_pages = self._extract_pdf_outline_the_old_way()
        else:
            number_of_pages = self._extract_pdf_outline_the_old_way()

        self.notify_watcher()
        return number_of_pages
    def _extract_pdf_outline_the_old_way(self):
        """Try to get the PDF outline using pdftk. This doesn't work
        well with all scripts.

        Returns the number of pages in the body PDF. Sets
        self.outline_contents to a list of (title, depth, pageno).
        """
        debugf = self.filepath('extracted-outline.txt')
        self.outline_contents, number_of_pages = \
            parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            # Replace each heading's text with an ASCII key ("h1_0",
            # "h2_3", ...) and remember the real title under that key.
            titlemap = {}
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        # h1 keys go inside a styled <strong>, matching
                        # how chapter numbers are marked up elsewhere.
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

            # Render an ASCII-headed copy of the book and re-extract.
            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii-extracted-outline.txt')
            ascii_contents, number_of_ascii_pages = \
                parse_outline(ascii_pdf_file, 1, debugf)
            # Map the ASCII keys back to the original titles.
            self.outline_contents = []
            log ("number of pages: %s, post ascii: %s" %
                 (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))

        return number_of_pages
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')

        #3. extract the outline (and page count) from the pdf
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()
    def make_preamble_pdf(self):
        """Build the preamble PDF: inside front cover plus the table
        of contents, numbered in the preamble style (roman)."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log_types(self.dir, self.css_url, self.title, inside_cover_html,
                  self.toc_header, contents, self.title)

        # The trailing invisible div (white text, page-break-after)
        # pads the preamble so the body starts on a fresh page.
        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     self.toc_header, contents, self.title)
        save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        # number_start=-2: numbering starts before page 1, so the
        # visible roman numbers line up with the printed preamble.
        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()
    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover. If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        #log(end_matter)
        save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        # even_pages=False: the tail may legitimately end on an odd page.
        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()
    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        # Assemble the final PDF: preamble, body, tail, then the ISBN
        # barcode page (isbn_pdf_file is None when there is no ISBN).
        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')
    def make_templated_html(self, template=None, zip=False, index=config.TEMPLATING_INDEX_FIRST):
        """Make a templated html version of the book.

        template -- HTML template string; the configured default
                    template file is used when empty.
        zip      -- NOTE(review): appears unused in this body; verify
                    against callers before removing.
        index    -- templating index mode, selecting the filenames for
                    the contents page and the first chapter.
        """
        #set up the directory and static files
        self.unpack_static()
        destdir = self.filepath('html')
        os.mkdir(destdir)
        os.rename(self.filepath('static'), self.filepath(os.path.join(destdir, 'static')))

        if not template:
            template_tree = lxml.html.parse(config.TEMPLATING_DEFAULT_TEMPLATE).getroot()
        else:
            template_tree = lxml.html.document_fromstring(template)

        tocmap = filename_toc_map(self.toc)
        contents_name, first_name = config.TEMPLATING_INDEX_MODES[index]

        #build a contents page and a contents menu
        #We can't make this in the same pass because the menu needs to
        #go in every page (i.e., into the template)
        menu = etree.Element('ul', Class=config.TEMPLATING_MENU_ELEMENT)
        contents = etree.Element('div', Class=config.TEMPLATING_REPLACED_ELEMENT)

        booktitle = etree.Element('div', Class=config.TEMPLATING_BOOK_TITLE_ELEMENT)
        log(self.title)
        booktitle.text = self.title.decode('utf-8')

        etree.SubElement(contents, 'h1').text = self.title.decode('utf-8')

        # savename tracks the output filename for the next chapter
        # link; the first chapter gets first_name (e.g. index.html).
        savename = first_name
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            #handle any TOC points in this file.
            for point in tocmap[filename]:
                if point['type'] == 'booki-section':
                    etree.SubElement(contents, 'h2').text = point['title']
                    etree.SubElement(menu, 'li', Class='booki-section').text = point['title']
                else:
                    if savename is None:
                        savename = filename
                    div = etree.SubElement(contents, 'div')
                    etree.SubElement(div, 'a', href=savename).text = point['title']
                    li = etree.SubElement(menu, 'li')
                    li.tail = '\n'
                    etree.SubElement(li, 'a', href=savename).text = point['title']
                    savename = None
        #put the menu and book title into the template (if it wants it)
        for e in template_tree.iterdescendants(config.TEMPLATING_MENU_ELEMENT):
            e.getparent().replace(e, copy.deepcopy(menu))
        for e in template_tree.iterdescendants(config.TEMPLATING_BOOK_TITLE_ELEMENT):
            e.getparent().replace(e, copy.deepcopy(booktitle))

        #function to template content and write to disk
        def save_content(content, title, filename):
            # Splice content into a copy of the template and save it
            # as filename under destdir.
            if not isinstance(title, unicode):
                title = title.decode('utf-8')
            content.set('id', config.TEMPLATING_CONTENTS_ID)
            content.tag = 'div'
            dest = copy.deepcopy(template_tree)
            dest.set('dir', self.dir)
            for e in dest.iterdescendants(config.TEMPLATING_REPLACED_ELEMENT):
                #copy only if there are more than 2
                if content.getparent() is not None:
                    content = copy.deepcopy(content)
                e.getparent().replace(e, content)

            chaptertitle = etree.Element('div', Class=config.TEMPLATING_CHAPTER_TITLE_ELEMENT)
            chaptertitle.text = title
            for e in template_tree.iterdescendants(config.TEMPLATING_CHAPTER_TITLE_ELEMENT):
                e.getparent().replace(e, copy.deepcopy(chaptertitle))
            for e in dest.iterdescendants('title'):
                #log(type(title), title)
                e.text = title
            self.save_tempfile(os.path.join(destdir, filename), lxml.html.tostring(dest))

        #write the contents to a file. (either index.html or contents.html)
        save_content(contents, self.title, contents_name)

        savename = first_name
        #and now write each chapter to a file
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            try:
                root = self.get_tree_by_id(ID).getroot()
                body = root.find('body')
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot().find('body')" % (e, ID))
                body = etree.Element('body')

            #handle any TOC points in this file. There should only be one!
            for point in tocmap[filename]:
                if point['type'] != 'booki-section':
                    title = point['title']
                    break
            else:
                title = self.title

            if savename is None:
                savename = filename
            save_content(body, title, savename)
            savename = None
        log(destdir, self.publish_file)
        os.rename(destdir, self.publish_file)
        self.notify_watcher()
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page. This is used for multicolumn newspapers and for
        web-destined pdfs."""
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            # Web PDFs are not bound, so no binding gutter is needed.
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()
    def rotate180(self):
        """Rotate the pdf 180 degrees so an RTL book can print on LTR
        presses."""
        rotated = self.filepath('final-rotate.pdf')
        unrotated = self.filepath('final-pre-rotate.pdf')
        #leave the unrotated pdf intact at first, in case of error.
        rotate_pdf(self.pdf_file, rotated)
        os.rename(self.pdf_file, unrotated)
        os.rename(rotated, self.pdf_file)
        self.notify_watcher()
    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()
    def publish_bookizip(self):
        """Publish the bookizip. For this, copy rather than move,
        because the bookizip might be used by further processing. If
        possible, a hard link is created."""
        log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
        try:
            # cp -l hard-links instead of copying; falls back to a
            # plain copy when linking fails (e.g. across filesystems).
            run(['cp', '-l', self.bookizip_file, self.publish_file])
        except OSError:
            run(['cp', self.bookizip_file, self.publish_file])
        self.notify_watcher()
    def concat_html(self):
        """Join all the chapters together into one tree. Keep the TOC
        up-to-date along the way.

        Returns the combined lxml document.
        """
        #each manifest item looks like:
        #{'contributors': []
        #'license': [],
        #'mimetype': '',
        #'rightsholders': []
        #'url': ''}
        doc = lxml.html.document_fromstring('<html><body></body></html>')
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            # ACO MIJENJAO
            try:
                root = self.get_tree_by_id(ID).getroot()
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
                continue
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            #log('chapter has title "%s", found html title "%s"' %
                            #    (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        # No reusable first element: insert an invisible
                        # anchor div carrying the generated fragment ID.
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                    # NOTE(review): indentation reconstructed -- html_id is
                    # recorded only when the fragment was generated here;
                    # confirm against upstream history.
                    point['html_id'] = fragment

            add_guts(root, doc)
        return doc
    def unpack_static(self):
        """Extract static files from the zip for the html to refer to."""
        static_files = [x['url'] for x in self.manifest.values()
                        if x['url'].startswith('static')]
        if static_files:
            os.mkdir(self.filepath('static'))

        for name in static_files:
            s = self.store.read(name)
            f = open(self.filepath(name), 'w')
            f.write(s)
            f.close()
        self.notify_watcher()
    def load_book(self):
        """Unpack the bookizip and build self.tree, the single
        concatenated HTML document for the whole book."""
        #XXX concatenate the HTML to match how TWiki version worked.
        # This is perhaps foolishly early -- throwing away useful boundaries.
        self.unpack_static()
        self.tree = self.concat_html()
        self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            h1.title = h1.text_content().strip()
        self.notify_watcher()
    def make_contents(self):
        """Generate HTML containing the table of contents. This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline.

        Returns the contents table as a UTF-8 byte string.
        """
        header = '<table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        #log(self.outline_contents)
        outline_contents = iter(self.outline_contents)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    # Skip outline entries until the next top-level
                    # (level 1) heading, which starts this chapter.
                    level = 99
                    while level > 1:
                        h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        if isinstance(doc, unicode):
            doc = doc.encode('utf-8')
        self.notify_watcher()
        return doc
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies. These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        chapter = 1
        section = None
        #log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#'+ child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
                # Insert the section page just before the section's
                # anchor element in the document.
                location = self.tree.cssselect('#'+ t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()
    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to.

        Returns the URL that was linked into the document head.
        """
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
            if css_default is None:
                #guess from language -- this should come first
                css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                    config.LANGUAGE_CSS['en'])
                css_default = css_modes.get(mode, css_modes[None])
            url = css_default
        elif not re.match(r'^http://\S+$', css):
            # Raw CSS text: stash it in the workdir and link to that.
            url = path2url(self.save_tempfile('objavi.css', css), full=True)
        else:
            url = css

        #find the head -- it's probably first child but lets not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the approriate language, otherwise in english.

        template is a filename pattern with one %s slot for the
        language code. NOTE(review): if no candidate file opens, f is
        unbound and the f.read() below raises NameError -- verify
        intended behaviour.
        """
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template
    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        # Fill the localised template's named slots.
        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           }
    def compose_end_matter(self):
        """create the markup for the end_matter inside cover. If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages.
        """
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title,
             }

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            # Force a page break so the back cover lands on its own page.
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d
    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive."""
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        def add_file(ID, filename, mediatype, content):
            # Register one content file with the epub manifest.
            ebook.add_content({'media-type': mediatype.encode('utf-8'),
                               'id': ID.encode('utf-8'),
                               'href': filename.encode('utf-8'),
                               }, content)
        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        spinemap = {} #map IDs to multi-file chapters
        for ID in self.manifest:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            content = self.store.read(fn)
            if mediatype == 'text/html':
                #convert to application/xhtml+xml, and perhaps split
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                if fn[-5:] == '.html':
                    fnbase = fn[:-5]
                else:
                    fnbase = fn
                fnx = fnbase + '.xhtml'
                mediatype = 'application/xhtml+xml'

                # NOTE(review): split_html is not among this module's
                # imports (xhtml_utils provides split_tree) -- verify.
                fragments = split_html(c.as_xhtml(),
                                       compressed_size=self.store.getinfo(fn).compress_size)

                #add the first one as if it is the whole thing (as it often is)
                add_file(ID, fnx, mediatype, fragments[0])
                filemap[fn] = fnx
                if len(fragments) > 1:
                    spine_ids = [ID]
                    spinemap[ID] = spine_ids
                    #add any extras
                    for i in range(1, len(fragments)):
                        # XXX it is possible for duplicates if another
                        # file happens to have this name. Ignore for now
                        _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                        spine_ids.append(_id)
                        add_file(_id,
                                 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                                 mediatype, fragments[i])

            else:
                add_file(ID, fn, mediatype, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            if ID in spinemap:
                for x in spinemap[ID]:
                    ebook.add_spine_item({'idref': x})
            else:
                ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                            }
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}
                        meta_info_items.append(item)

        # NOTE(review): when 'creator' is absent, the .values() lookup
        # below would raise KeyError -- confirm the intended source of
        # the contributor list.
        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            authors = []
            for x in self.metadata[DC]['creator'].values():
                authors.extend(x)

            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))})

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()
        self.notify_watcher()
992 def publish_s3(self):
993 """Push the book's epub to archive.org, using S3."""
994 #XXX why only epub?
995 secrets = {}
996 for x in ('S3_SECRET', 'S3_ACCESSKEY'):
997 fn = getattr(config, x)
998 f = open(fn)
999 secrets[x] = f.read().strip()
1000 f.close()
1002 now = time.strftime('%F')
1003 s3output = self.filepath('s3-output.txt')
1004 s3url, detailsurl = find_archive_urls(self.book, self.bookname)
1005 headers = [
1006 'x-amz-auto-make-bucket:1',
1007 "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
1008 'x-archive-meta-mediatype:texts',
1009 'x-archive-meta-collection:opensource',
1010 'x-archive-meta-title:%s' % (self.book,),
1011 'x-archive-meta-date:%s' % (now,),
1012 'x-archive-meta-creator:FLOSS Manuals Contributors',
1015 if self.license in config.LICENSES:
1016 headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
1018 argv = ['curl', '--location', '-s', '-o', s3output]
1019 for h in headers:
1020 argv.extend(('--header', h))
1021 argv.extend(('--upload-file', self.publish_file, s3url,))
1023 log(' '.join(repr(x) for x in argv))
1024 check_call(argv, stdout=sys.stderr)
1025 self.notify_watcher()
1026 return detailsurl, s3url
1029 def spawn_x(self):
1030 """Start an Xvfb instance, using a new server number. A
1031 reference to it is stored in self.xvfb, which is used to kill
1032 it when the pdf is done.
1034 Note that Xvfb doesn't interact well with dbus which is
1035 present on modern desktops.
1037 #Find an unused server number (in case two cgis are running at once)
1038 while True:
1039 servernum = random.randrange(50, 500)
1040 if not os.path.exists('/tmp/.X%s-lock' % servernum):
1041 break
1043 self.xserver_no = ':%s' % servernum
1045 authfile = self.filepath('Xauthority')
1046 os.environ['XAUTHORITY'] = authfile
1048 #mcookie(1) eats into /dev/random, so avoid that
1049 from hashlib import md5
1050 m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
1051 mcookie = m.hexdigest()
1053 check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
1055 self.xvfb = Popen(['Xvfb', self.xserver_no,
1056 '-screen', '0', '1024x768x24',
1057 '-pixdepths', '32',
1058 #'-blackpixel', '0',
1059 #'-whitepixel', str(2 ** 24 -1),
1060 #'+extension', 'Composite',
1061 '-dpi', '96',
1062 #'-kb',
1063 '-nolisten', 'tcp',
1066 # We need to wait a bit before the Xvfb is ready. but the
1067 # downloads are so slow that that probably doesn't matter
1069 self.xvfb_ready_time = time.time() + 2
1071 os.environ['DISPLAY'] = self.xserver_no
1072 log(self.xserver_no)
1074 def wait_for_xvfb(self):
1075 """wait until a previously set time before continuing. This
1076 is so Xvfb has time to properly start."""
1077 if hasattr(self, 'xvfb'):
1078 d = self.xvfb_ready_time - time.time()
1079 if d > 0:
1080 time.sleep(d)
1081 self.notify_watcher()
    def cleanup_x(self):
        """Try very hard to kill off Xvfb. In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        # Nothing to do if spawn_x() was never called.
        if not hasattr(self, 'xvfb'):
            return
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        # signal 15 (SIGTERM) first, giving Xvfb a chance to exit cleanly
        os.kill(p.pid, 15)
        # poll for up to ~2 seconds (10 x 0.2s) waiting for it to die
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            # for/else: only reached when the loop never broke, i.e.
            # Xvfb survived SIGTERM -- escalate to signal 9 (SIGKILL)
            log("Xvfb would not die! kill -9! kill -9!")
            try:
                os.kill(p.pid, 9)
            except OSError, e:
                # it may have died between the last poll and the kill
                log(e)

        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()
1110 def kill_old_processes(self):
1111 """Sometimes, despite everything, Xvfb or soffice instances
1112 hang around well after they are wanted -- for example if the
1113 cgi process dies particularly badly. So kill them if they have
1114 been running for a long time."""
1115 log("running kill_old_processes")
1116 killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1117 os.path.basename(config.HTML2ODT),
1118 os.path.basename(config.WKHTMLTOPDF),
1120 p = Popen(['ps', '-C', killable_names,
1121 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1122 data = p.communicate()[0].strip()
1123 if data:
1124 lines = data.split('\n')
1125 pids = []
1126 for line in lines:
1127 log('dealing with ps output "%s"' % line)
1128 try:
1129 pid, days, hours, minutes, seconds \
1130 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1131 except AttributeError:
1132 log("Couldn't parse that line!")
1133 # 50 minutes should be enough xvfb time for anyone
1134 if days or hours or int(minutes) > 50:
1135 pid = int(pid)
1136 log("going to kill pid %s" % pid)
1137 os.kill(pid, 15)
1138 pids.append(pid)
1140 time.sleep(1.0)
1141 for pid in pids:
1142 #try again in case any are lingerers
1143 try:
1144 os.kill(int(pid), 9)
1145 except OSError, e:
1146 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1147 continue
1148 log('killing %s with -9' % pid)
1149 self.notify_watcher()
1151 def cleanup(self):
1152 self.cleanup_x()
1153 if not config.KEEP_TEMP_FILES:
1154 for fn in os.listdir(self.workdir):
1155 os.remove(os.path.join(self.workdir, fn))
1156 os.rmdir(self.workdir)
1157 else:
1158 log("NOT removing '%s', containing the following files:" % self.workdir)
1159 log(*os.listdir(self.workdir))
1161 self.notify_watcher()
def use_cache():
    """Return True when this host is configured to always use cached
    booki-zips."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
1167 def _read_cached_zip(server, book, max_age):
1168 #find a recent zip if possible
1169 prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1170 from glob import glob
1171 zips = sorted(glob(prefix + '*.zip'))
1172 if not zips:
1173 log("no cached booki-zips matching %s*.zip" % (prefix,))
1174 return None
1175 zipname = zips[-1]
1176 cutoff = time.time() - max_age * 60
1177 log(repr(zipname))
1178 try:
1179 date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1180 if date > cutoff:
1181 f = open(zipname)
1182 blob = f.read()
1183 f.close()
1184 return blob, zipname
1185 log("%s is too old, must reload" % zipname)
1186 return None
1187 except (IOError, IndexError, ValueError), e:
1188 log('could not make sense of %s: got exception %s' % (zipname, e))
1189 return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for the book, preferring a cached copy when
    max_age (in minutes) permits it.

    Returns (blob, filename); filename is None unless the blob came
    from, or was saved to, the local cache.  Raises NotImplementedError
    for an unknown server interface.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        #NOTE(review): HTTP_HOST looks like a module-level constant
        #defined outside this chunk -- confirm it exists at import time
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book':book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        #zip data is binary: 'wb', not 'w', or it corrupts on Windows
        f = open(filename, 'wb')
        try:
            f.write(blob)
        finally:
            f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    The number of splits is driven by the compressed and uncompressed
    size limits in config.  Marker <hr/> elements are inserted at tag
    boundaries near the even split points and split_tree() divides the
    document there.  Returns a list of html strings (a single-element
    list when no split is needed).
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        #cut at the next tag boundary at or after the ideal split point
        e = html.find('<', target * (i + 1))
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]