More unicode mangling
[objavi2.git] / objavi / fmbook.py
blob7a976ec3bd46ca6daa2e453258fe23b777001d71
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 from subprocess import Popen, check_call, PIPE
28 from cStringIO import StringIO
29 from urllib2 import urlopen, HTTPError
30 import zipfile
31 import traceback
32 from string import ascii_letters
33 from pprint import pformat
35 try:
36 import simplejson as json
37 except ImportError:
38 import json
40 import lxml.html
41 from lxml import etree
43 from objavi import config, epub_utils
44 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
45 from objavi.book_utils import ObjaviError, log_types
46 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
47 from objavi.epub import add_guts, _find_tag
48 from objavi.xhtml_utils import EpubChapter, split_tree
49 from objavi.cgi_utils import url2path
51 from iarchive import epub as ia_epub
52 from booki.bookizip import get_metadata, add_metadata
# Working area for temporary per-book build directories (absolute path).
TMPDIR = os.path.abspath(config.TMPDIR)
# Web document root and host, taken from the CGI environment when present.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the (s3 upload url, details page url) pair for a book
    hosted on archive.org under the 'booki-<bookid>' bucket."""
    bucket = 'booki-%s' % bookid
    s3url = 'http://s3.us.archive.org/%s/%s' % (bucket, bookname)
    detailsurl = 'http://archive.org/details/%s' % bucket
    return (s3url, detailsurl)
63 def _get_best_title(tocpoint):
64 if 'html_title' in tocpoint:
65 return tocpoint['html_title']
66 if 'title' in tocpoint:
67 return tocpoint['title']
68 return 'Untitled'
71 def _add_initial_number(e, n):
72 """Put a styled chapter number n at the beginning of element e."""
73 initial = e.makeelement("strong", Class="initial")
74 e.insert(0, initial)
75 initial.tail = ' '
76 if e.text is not None:
77 initial.tail += e.text
78 e.text = ''
79 initial.text = "%s." % n
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience: annotate every point
    (recursively) with its depth, filename, fragment and a running
    serial index.  Returns the next unused index."""
    for point in toc:
        url = point['url'].lstrip('/')
        filename, sep, fragment = url.partition('#')
        point['depth'] = depth
        point["filename"] = filename
        point["fragment"] = fragment if sep else None
        point["index"] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
97 def _serialise(rtoc, stoc, depth):
98 for item in rtoc:
99 url = item['url'].lstrip('/')
100 bits = url.split('#', 1)
101 filename = bits[0]
102 fragment = (bits[1] if len(bits) == 2 else None)
103 stoc.append({"depth": depth,
104 "title": item['title'],
105 "url": url,
106 "filename": filename,
107 "fragment": fragment,
108 "type": item['type']
110 if 'children' in item:
111 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points. Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Map each filename to the list of TOC points that live in that
    file, in document (preorder) sequence."""
    tocmap = {}
    pending = list(rtoc)
    while pending:
        point = pending.pop(0)
        tocmap.setdefault(point['filename'], []).append(point)
        if 'children' in point:
            # Visit children before later siblings (preorder).
            pending[0:0] = point['children']
    return tocmap
def save_data(fn, data):
    """Save without tripping up on unicode: unicode text is encoded
    as UTF-8 (undecodable characters dropped) before writing."""
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    with open(fn, 'w') as out:
        out.write(data)
class Book(object):
    """One book being rendered from a bookizip into PDF/ODT/epub output."""
    # Numbering style for the main body pages.
    page_numbers = 'latin'
    # Numbering style for the front-matter (preamble) pages.
    preamble_page_numbers = 'roman'
def notify_watcher(self, message=None):
    """Report progress to every registered watcher callback.

    When no message is given, the name of the calling function is
    used as the message."""
    if not self.watchers:
        return
    if message is None:
        # default to the caller's function name
        message = traceback.extract_stack(None, 2)[0][2]
    log("notify_watcher called with '%s'" % message)
    for watcher in self.watchers:
        watcher(message)
def __enter__(self):
    """Context-manager entry: the Book itself is the managed resource."""
    return self
def __exit__(self, exc_type, exc_value, tb):
    """Context-manager exit: announce completion, then clean up the
    working directory.  Exceptions are not suppressed."""
    self.notify_watcher(config.FINISHED_MESSAGE)
    self.cleanup()
    #could deal with exceptions here and return true
166 def __init__(self, book, server, bookname,
167 page_settings=None, watchers=None, isbn=None,
168 license=config.DEFAULT_LICENSE, title=None,
169 max_age=0):
170 log("*** Starting new book %s ***" % bookname)
171 self.watchers = set()
172 if watchers is not None:
173 self.watchers.update(watchers)
174 self.notify_watcher('start')
175 self.bookname = bookname
176 self.book = book
177 self.server = server
178 self.cookie = ''.join(random.sample(ascii_letters, 10))
179 try:
180 blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
181 except HTTPError, e:
182 traceback.print_exc()
183 self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
184 #not much to do?
185 #raise 502 Bad Gateway ?
186 sys.exit()
187 f = StringIO(blob)
188 self.notify_watcher('fetch_zip')
189 self.store = zipfile.ZipFile(f, 'r')
190 self.info = json.loads(self.store.read('info.json'))
191 for k in ('manifest', 'metadata', 'spine', 'TOC'):
192 if k not in self.info:
193 raise ObjaviError('info.json of %s lacks vital element "%s"' %
194 (bookname, k))
195 #check types also?
197 self.metadata = self.info['metadata']
198 self.spine = self.info['spine']
199 self.manifest = self.info['manifest']
201 if server == config.LOCALHOST: # [DEPRECATED]
202 server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
203 book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
205 log(pformat(self.metadata))
206 self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
207 if not self.lang:
208 self.lang = guess_lang(server, book)
209 log('guessed lang as %s' % self.lang)
211 self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
212 if not self.toc_header:
213 self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
215 self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
216 if not self.dir:
217 self.dir = guess_text_dir(server, book)
219 #Patch in the extra metadata. (lang and dir may be set from config)
220 #these should be read from zip -- so should go into zip?
221 for var, key, scheme, ns in (
222 (isbn, 'id', 'ISBN', config.DC),
223 (license, 'rights', 'License', config.DC),
224 (title, 'title', '', config.DC),
225 (self.lang, 'language', '', config.DC),
226 (self.dir, 'dir', '', config.FM),
228 if var is not None:
229 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
231 self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
232 self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
234 self.toc = self.info['TOC']
235 expand_toc(self.toc)
237 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
238 os.chmod(self.workdir, 0755)
240 self.body_html_file = self.filepath('body.html')
241 self.body_pdf_file = self.filepath('body.pdf')
242 self.preamble_html_file = self.filepath('preamble.html')
243 self.preamble_pdf_file = self.filepath('preamble.pdf')
244 self.tail_html_file = self.filepath('tail.html')
245 self.tail_pdf_file = self.filepath('tail.pdf')
246 self.isbn_pdf_file = None
247 self.pdf_file = self.filepath('final.pdf')
248 self.body_odt_file = self.filepath('body.odt')
249 self.outline_file = self.filepath('outline.txt')
251 self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
253 if page_settings is not None:
254 self.maker = PageSettings(**page_settings)
256 if title is not None:
257 self.title = title
258 else:
259 titles = get_metadata(self.metadata, 'title')
260 if titles:
261 self.title = titles[0]
262 else:
263 self.title = 'A Book About ' + self.book
264 if isinstance(self.title, unicode):
265 self.title = self.title.encode('utf-8')
267 self.notify_watcher()
if config.TRY_BOOK_CLEANUP_ON_DEL:
    #Dont even define __del__ if it is not used.
    _try_cleanup_on_del = True
    def __del__(self):
        """Last-ditch removal of the workdir when the Book is collected."""
        if self._try_cleanup_on_del and os.path.exists(self.workdir):
            self._try_cleanup_on_del = False #or else you can get in bad cycles
            self.cleanup()
def get_tree_by_id(self, id):
    """get an HTML tree from the given manifest ID

    HTML is parsed leniently (an unparseable file yields an empty
    document); other XML is parsed with etree; anything else is
    returned as the raw string."""
    name = self.manifest[id]['url']
    mimetype = self.manifest[id]['mimetype']
    s = self.store.read(name)
    f = StringIO(s)
    if mimetype == 'text/html':
        try:
            tree = lxml.html.parse(f)
        except etree.XMLSyntaxError, e:
            log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                (id, name, s[:20], e))
            # Fall back to an empty document so processing can continue.
            tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
    elif 'xml' in mimetype: #XXX or is this just asking for trouble?
        tree = etree.parse(f)
    else:
        # Not markup: hand back the raw contents.
        tree = f.read()
    f.close()
    return tree
def filepath(self, fn):
    """Return fn as a path inside this book's working directory."""
    return os.path.join(self.workdir, fn)
def save_tempfile(self, fn, data):
    """Save the data in a temporary directory that will be cleaned
    up when all is done. Return the absolute file path."""
    path = self.filepath(fn)
    save_data(path, data)
    return path
def make_oo_doc(self):
    """Make an openoffice document, using the html2odt script."""
    # html2odt drives OpenOffice, which needs the X display from spawn_x.
    self.wait_for_xvfb()
    html_text = etree.tostring(self.tree, method="html")
    save_data(self.body_html_file, html_text)
    run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
    log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
    os.rename(self.body_odt_file, self.publish_file)
    self.notify_watcher()
def extract_pdf_outline(self):
    """Get the outline (table of contents) for the PDF, which
    wkhtmltopdf should have written to a file. If that file
    doesn't exist (or config says not to use it), fall back to
    using self._extract_pdf_outline_the_old_way, below.

    Returns the number of pages in the body PDF.
    """
    if config.USE_DUMP_OUTLINE:
        try:
            self.outline_contents, number_of_pages = \
                parse_extracted_outline(self.outline_file)

        except Exception, e:
            # Any failure reading the dumped outline: log it and fall back.
            traceback.print_exc()
            number_of_pages = self._extract_pdf_outline_the_old_way()
    else:
        number_of_pages = self._extract_pdf_outline_the_old_way()

    self.notify_watcher()
    return number_of_pages
def _extract_pdf_outline_the_old_way(self):
    """Try to get the PDF outline using pdftk. This doesn't work
    well with all scripts.

    If no outline is found (typical with non-Latin text), rebuild a
    copy of the book whose headings are replaced with ASCII keys,
    extract the outline from that, and map the keys back to the real
    titles.  Returns the number of pages in the body PDF."""
    debugf = self.filepath('extracted-outline.txt')
    self.outline_contents, number_of_pages = \
        parse_outline(self.body_pdf_file, 1, debugf)

    if not self.outline_contents:
        #probably problems with international text. need a horrible hack
        log('no outline: trying again with ascii headings')
        import copy
        tree = copy.deepcopy(self.tree)
        titlemap = {}
        for tag in ('h1', 'h2', 'h3', 'h4'):
            for i, e in enumerate(tree.getiterator(tag)):
                # Remember the real title under a synthetic ASCII key,
                # then replace the heading's content with that key.
                key = "%s_%s" % (tag, i)
                titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                del e[:]
                if tag == 'h1':
                    e = lxml.etree.SubElement(e, "strong", Class="initial")
                e.text = key
                log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

        ascii_html_file = self.filepath('body-ascii-headings.html')
        ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
        html_text = lxml.etree.tostring(tree, method="html")
        save_data(ascii_html_file, html_text)
        self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
        debugf = self.filepath('ascii-extracted-outline.txt')
        ascii_contents, number_of_ascii_pages = \
            parse_outline(ascii_pdf_file, 1, debugf)
        self.outline_contents = []
        log ("number of pages: %s, post ascii: %s" %
             (number_of_pages, number_of_ascii_pages))
        for ascii_title, depth, pageno in ascii_contents:
            if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                ascii_title = ascii_title[:-4]
            if ' ' in ascii_title:
                # Keep only the synthetic key (last space-separated word).
                ascii_title = ascii_title.rsplit(' ', 1)[1]
            title = titlemap.get(ascii_title, '')
            log((ascii_title, title, depth, pageno))

            self.outline_contents.append((title, depth, pageno))

    return number_of_pages
def make_body_pdf(self):
    """Make a pdf of the HTML, using webkit"""
    #1. Save the html
    html_text = etree.tostring(self.tree, method="html")
    save_data(self.body_html_file, html_text)

    #2. Make a pdf of it
    self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
    self.notify_watcher('generate_pdf')

    # 3. Read back the outline to learn page numbers and TOC entries.
    n_pages = self.extract_pdf_outline()

    log ("found %s pages in pdf" % n_pages)
    #4. resize pages, shift gutters, even pages
    self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
    self.notify_watcher('reshape_pdf')

    #5 add page numbers
    self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                          numbers=self.page_numbers)
    self.notify_watcher("number_pdf")
    self.notify_watcher()
def make_preamble_pdf(self):
    """Build the front-matter PDF: inside cover page plus the table
    of contents, numbered in the preamble (roman) style."""
    contents = self.make_contents()
    inside_cover_html = self.compose_inside_cover()
    log_types(self.dir, self.css_url, self.title, inside_cover_html,
              self.toc_header, contents, self.title)

    html = ('<html dir="%s"><head>\n'
            '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
            '<link rel="stylesheet" href="%s" />\n'
            '</head>\n<body>\n'
            '<h1 class="frontpage">%s</h1>'
            '%s\n'
            '<div class="contents"><h1>%s</h1>\n%s</div>\n'
            '<div style="page-break-after: always; color:#fff" class="unseen">.'
            '<!--%s--></div></body></html>'
            ) % (self.dir, self.css_url, self.title, inside_cover_html,
                 self.toc_header, contents, self.title)
    save_data(self.preamble_html_file, html)

    self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

    self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

    # number_start=-2: offsets the numbering for the cover pages.
    self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                          numbers=self.preamble_page_numbers,
                          number_start=-2)

    self.notify_watcher()
def make_end_matter_pdf(self):
    """Make an inside back cover and a back cover. If there is an
    isbn number its barcode will be put on the back cover."""
    if self.isbn:
        self.isbn_pdf_file = self.filepath('isbn.pdf')
        self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
        self.notify_watcher('make_barcode_pdf')

    end_matter = self.compose_end_matter()
    #log(end_matter)
    save_data(self.tail_html_file, end_matter.decode('utf-8'))
    self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

    # even_pages=False: the tail may legitimately end on an odd page.
    self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                           centre_end=True, even_pages=False)
    self.notify_watcher()
def make_book_pdf(self):
    """A convenient wrapper of a few necessary steps"""
    # now the Xvfb server is needed. make sure it has had long enough to get going
    self.wait_for_xvfb()
    self.make_body_pdf()
    self.make_preamble_pdf()
    self.make_end_matter_pdf()

    # Assemble the final book: preamble, body, end matter, optional ISBN page.
    concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                self.body_pdf_file, self.tail_pdf_file,
                self.isbn_pdf_file)

    self.notify_watcher('concatenated_pdfs')
def make_simple_pdf(self, mode):
    """Make a simple pdf document without contents or separate
    title page. This is used for multicolumn newspapers and for
    web-destined pdfs."""
    self.wait_for_xvfb()
    #0. Add heading to begining of html
    body = list(self.tree.cssselect('body'))[0]
    e = body.makeelement('h1', {'id': 'book-title'})
    e.text = self.title
    body.insert(0, e)
    intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
    e.addnext(intro)

    #0.5 adjust parameters to suit the particular kind of output
    if mode == 'web':
        self.maker.gutter = 0

    #1. Save the html
    html_text = etree.tostring(self.tree, method="html")
    save_data(self.body_html_file, html_text)

    #2. Make a pdf of it (direct to to final pdf)
    self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
    self.notify_watcher('generate_pdf')
    n_pages = count_pdf_pages(self.pdf_file)

    if mode != 'web':
        #3. resize pages and shift gutters.
        self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #4. add page numbers
        self.maker.number_pdf(self.pdf_file, n_pages,
                              dir=self.dir, numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
    self.notify_watcher()
def rotate180(self):
    """Rotate the pdf 180 degrees so an RTL book can print on LTR
    presses."""
    rotated_copy = self.filepath('final-rotate.pdf')
    backup = self.filepath('final-pre-rotate.pdf')
    # Write the rotated copy first so the original pdf survives any
    # error, then swap the files into place.
    rotate_pdf(self.pdf_file, rotated_copy)
    os.rename(self.pdf_file, backup)
    os.rename(rotated_copy, self.pdf_file)
    self.notify_watcher()
def publish_pdf(self):
    """Move the finished PDF to its final resting place"""
    log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
    os.rename(self.pdf_file, self.publish_file)
    self.notify_watcher()
def publish_bookizip(self):
    """Publish the bookizip. For this, copy rather than move,
    because the bookizip might be used by further processing. If
    possible, a hard link is created."""
    log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
    try:
        # 'cp -l' hard-links; fails (OSError from run) across filesystems.
        run(['cp', '-l', self.bookizip_file, self.publish_file])
    except OSError:
        run(['cp', self.bookizip_file, self.publish_file])
    self.notify_watcher()
def concat_html(self):
    """Join all the chapters together into one tree.  Keep the TOC
    up-to-date along the way.

    Every TOC point ends up with an 'html_id' that can be used to
    locate it in the combined tree; chapters that start with a
    heading also get an 'html_title'."""
    #each manifest item looks like:
    #{'contributors': []
    #'license': [],
    #'mimetype': '',
    #'rightsholders': []
    #'url': ''}
    doc = lxml.html.document_fromstring('<html><body></body></html>')
    tocmap = filename_toc_map(self.toc)
    for ID in self.spine:
        details = self.manifest[ID]
        #log(ID, pformat(details))
        # ACO MIJENJAO
        try:
            root = self.get_tree_by_id(ID).getroot()
        except Exception, e:
            log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
            continue
        #handle any TOC points in this file
        for point in tocmap[details['url']]:
            #if the url has a #identifier, use it. Otherwise, make
            #one up, using a hidden element at the beginning of
            #the inserted document.
            #XXX this will break if different files use the same ids
            #XXX should either replace all, or replace selectively.
            if point['fragment']:
                fragment = point['fragment']
            else:
                body = _find_tag(root, 'body')
                fragment = '%s_%s' % (self.cookie, point['index'])
                #reuse first tag if it is suitable.
                if (len(body) and
                    body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                    if body[0].get('id') is None:
                        body[0].set('id', fragment)
                    else:
                        fragment = body[0].get('id')
                    #the chapter starts with a heading. that heading should be the chapter name.
                    if body[0].tag in ('h1', 'h2', 'h3'):
                        #log('chapter has title "%s", found html title "%s"' %
                        #    (point['title'], body[0].text_content()))
                        point['html_title'] = body[0].text_content()
                else:
                    # No suitable first tag: insert an invisible marker div.
                    marker = body.makeelement('div', style="display:none",
                                              id=fragment)
                    body.insert(0, marker)
            point['html_id'] = fragment

        add_guts(root, doc)
    return doc
def unpack_static(self):
    """Extract static files from the zip for the html to refer to."""
    static_files = [x['url'] for x in self.manifest.values()
                    if x['url'].startswith('static')]
    if static_files:
        os.mkdir(self.filepath('static'))

    for name in static_files:
        s = self.store.read(name)
        f = open(self.filepath(name), 'w')
        f.write(s)
        f.close()
    self.notify_watcher()
def load_book(self):
    """Unpack static files and build self.tree, the single
    concatenated HTML document for the whole book."""
    #XXX concatenate the HTML to match how TWiki version worked.
    # This is perhaps foolishly early -- throwing away useful boundaries.
    self.unpack_static()
    self.tree = self.concat_html()
    self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

    self.headings = [x for x in self.tree.cssselect('h1')]
    if self.headings:
        self.headings[0].set('class', "first-heading")
    for h1 in self.headings:
        h1.title = h1.text_content().strip()
    self.notify_watcher()
def make_contents(self):
    """Generate HTML containing the table of contents. This can
    only be done after the main PDF has been made, because the
    page numbers are contained in the PDF outline."""
    header = '<table class="toc">\n'
    row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                '<td class="pagenumber">%s</td></tr>\n')
    empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
    section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
    footer = '\n</table>'

    contents = []

    chapter = 1
    page_num = 1
    #log(self.outline_contents)
    outline_contents = iter(self.outline_contents)

    for section in self.toc:
        if not section.get('children'):
            contents.append(empty_section_tmpl % section['title'])
            continue
        contents.append(section_tmpl % section['title'])

        for point in section['children']:
            try:
                # Advance through the outline until the next top-level
                # (depth 1) entry; its page number is the chapter's.
                level = 99
                while level > 1:
                    h1_text, level, page_num = outline_contents.next()
            except StopIteration:
                log("contents data not found for %s. Stopping" % (point,))
                break
            contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
            chapter += 1

    doc = header + '\n'.join(contents) + footer
    if isinstance(doc, unicode):
        doc = doc.encode('utf-8')
    self.notify_watcher()
    return doc
def add_section_titles(self):
    """Add any section heading pages that the TOC.txt file
    specifies. These are sub-book, super-chapter groupings.

    Also add initial numbers to chapters.
    """
    chapter = 1
    section = None
    #log(self.toc)
    for t in self.toc:
        #only top level sections get a subsection page,
        #and only if they have children.
        if t.get('children'):
            section = self.tree.makeelement('div', Class="objavi-subsection")
            heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
            heading.text = t['title']
            for child in t['children']:
                item = etree.SubElement(section, 'div', Class="objavi-chapter")
                if 'html_title' in child:
                    # Chapter already has an in-document heading; number it there.
                    item.text = child['html_title']
                    heading = self.tree.cssselect('#'+ child['html_id'])
                    if heading:
                        _add_initial_number(heading[0], chapter)
                else:
                    item.text = child['title']
                    _add_initial_number(item, chapter)
                log(item.text, debug='HTMLGEN')
                chapter += 1
            log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
            # Insert the section page just before the section's first chapter.
            location = self.tree.cssselect('#'+ t['html_id'])[0]
            location.addprevious(section)

    self.notify_watcher()
def add_css(self, css=None, mode='book'):
    """If css looks like a url, use it as a stylesheet link.
    Otherwise it is the CSS itself, which is saved to a temporary file
    and linked to.

    Returns the resulting stylesheet URL (also kept as self.css_url)."""
    log("css is %r" % css)
    htmltree = self.tree
    if css is None or not css.strip():
        # No CSS given: use the server default, or guess from language.
        css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
        if css_default is None:
            #guess from language -- this should come first
            css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                config.LANGUAGE_CSS['en'])
            css_default = css_modes.get(mode, css_modes[None])
        url = 'file://' + os.path.abspath(url2path(css_default))
    elif not re.match(r'^http://\S+$', css):
        # Raw CSS text: save it and link to the saved copy.
        fn = self.save_tempfile('objavi.css', css)
        url = 'file://' + fn
    else:
        url = css

    #find the head -- it's probably first child but lets not assume.
    for child in htmltree:
        if child.tag == 'head':
            head = child
            break
    else:
        head = htmltree.makeelement('head')
        htmltree.insert(0, head)

    link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
    self.css_url = url
    self.notify_watcher()
    return url
def _read_localised_template(self, template, fallbacks=['en']):
    """Try to get the template in the appropriate language, otherwise in english.

    `template` is a filename pattern with one %s slot for the language
    code.  NOTE(review): mutable default argument (never mutated here);
    and if no candidate file opens, `f` is unbound below -> NameError."""
    for lang in [self.lang] + fallbacks:
        try:
            fn = template % (lang)
            f = open(fn)
            break
        except IOError, e:
            log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
            log(e)
    template = f.read()
    f.close()
    return template
def compose_inside_cover(self):
    """create the markup for the preamble inside cover.

    Fills the localised template with the date, an ISBN line (empty
    when the book has no ISBN) and the licence name.

    Fix: the substitution dict was truncated in this copy of the file
    (missing its closing '}'); restored here.
    """
    template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

    if self.isbn:
        isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
    else:
        isbn_text = ''

    return template % {'date': time.strftime('%Y-%m-%d'),
                       'isbn': isbn_text,
                       'license': self.license,
                       }
def compose_end_matter(self):
    """create the markup for the end_matter inside cover. If
    self.isbn is not set, the html will result in a pdf that
    spills onto two pages.

    Fix: the docstring terminator and the closing '}' of the
    substitution dict were truncated in this copy of the file;
    restored here.
    """
    template = self._read_localised_template(config.END_MATTER_TEMPLATE)

    d = {'css_url': self.css_url,
         'title': self.title
         }

    if self.isbn:
        d['inside_cover_style'] = ''
    else:
        # Force a page break so the end matter fills two pages.
        d['inside_cover_style'] = 'page-break-after: always'

    return template % d
def make_epub(self, use_cache=False):
    """Make an epub version of the book, using Mike McCabe's
    epub module for the Internet Archive.

    Fixes for truncation in this copy of the file: the metadata item
    dict's closing '}', the `meta_info_items.append(item)` line, and
    the final ')' of the rights append were lost; restored here.
    """
    ebook = ia_epub.Book(self.publish_file, content_dir='')
    def add_file(ID, filename, mediatype, content):
        # Register one file in the epub manifest and write its content.
        ebook.add_content({'media-type': mediatype.encode('utf-8'),
                           'id': ID.encode('utf-8'),
                           'href': filename.encode('utf-8'),
                           }, content)
    toc = self.info['TOC']

    #manifest
    filemap = {} #map html to corresponding xhtml
    spinemap = {} #map IDs to multi-file chapters
    for ID in self.manifest:
        details = self.manifest[ID]
        #log(ID, pformat(details))
        fn, mediatype = details['url'], details['mimetype']
        content = self.store.read(fn)
        if mediatype == 'text/html':
            #convert to application/xhtml+xml, and perhaps split
            c = EpubChapter(self.server, self.book, ID, content,
                            use_cache=use_cache)
            c.remove_bad_tags()
            if fn[-5:] == '.html':
                fnbase = fn[:-5]
            else:
                fnbase = fn
            fnx = fnbase + '.xhtml'
            mediatype = 'application/xhtml+xml'

            # NOTE(review): this module imports split_tree, not
            # split_html -- confirm where split_html is defined.
            fragments = split_html(c.as_xhtml(),
                                   compressed_size=self.store.getinfo(fn).compress_size)

            #add the first one as if it is the whole thing (as it often is)
            add_file(ID, fnx, mediatype, fragments[0])
            filemap[fn] = fnx
            if len(fragments) > 1:
                spine_ids = [ID]
                spinemap[ID] = spine_ids
                #add any extras
                for i in range(1, len(fragments)):
                    # XXX it is possible for duplicates if another
                    # file happens to have this name. Ignore for now
                    _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                    spine_ids.append(_id)
                    add_file(_id,
                             '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                             mediatype, fragments[i])
        else:
            add_file(ID, fn, mediatype, content)

    #toc
    ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
    ebook.add(ebook.content_dir + 'toc.ncx', ncx)

    #spine
    for ID in self.spine:
        if ID in spinemap:
            for x in spinemap[ID]:
                ebook.add_spine_item({'idref': x})
        else:
            ebook.add_spine_item({'idref': ID})

    #metadata -- no use of attributes (yet)
    # and fm: metadata disappears for now
    DCNS = config.DCNS
    DC = config.DC
    meta_info_items = []
    for ns, namespace in self.metadata.items():
        for keyword, schemes in namespace.items():
            if ns:
                keyword = '{%s}%s' % (ns, keyword)
            for scheme, values in schemes.items():
                for value in values:
                    item = {
                        'item': keyword,
                        'text': value,
                        }
                    if scheme:
                        if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                            item['atts'] = {'role': scheme}
                        else:
                            item['atts'] = {'scheme': scheme}
                    # Restored: collect the item (line lost in extraction).
                    meta_info_items.append(item)

    # NOTE(review): when has_authors is False the lookup below raises
    # KeyError on 'creator' -- confirm intended behaviour upstream.
    has_authors = 'creator' in self.metadata[DC]
    if not has_authors and config.CLAIM_UNAUTHORED:
        authors = []
        for x in self.metadata[DC]['creator'].values():
            authors.extend(x)

        meta_info_items.append({'item': DCNS + 'creator',
                                'text': 'The Contributors'})

        meta_info_items.append({'item': DCNS + 'rights',
                                'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                               )

    tree_str = ia_epub.make_opf(meta_info_items,
                                ebook.manifest_items,
                                ebook.spine_items,
                                ebook.guide_items,
                                ebook.cover_id)
    ebook.add(ebook.content_dir + 'content.opf', tree_str)
    ebook.z.close()
    self.notify_watcher()
def publish_s3(self):
    """Push the book's epub to archive.org, using S3.

    Returns (details page url, s3 url).  Fix: the headers list was
    truncated in this copy of the file (missing its closing ']');
    restored here.
    """
    #XXX why only epub?
    # Read the S3 credentials from the files named in config.
    secrets = {}
    for x in ('S3_SECRET', 'S3_ACCESSKEY'):
        fn = getattr(config, x)
        f = open(fn)
        secrets[x] = f.read().strip()
        f.close()

    now = time.strftime('%F')
    s3output = self.filepath('s3-output.txt')
    s3url, detailsurl = find_archive_urls(self.book, self.bookname)
    headers = [
        'x-amz-auto-make-bucket:1',
        "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
        'x-archive-meta-mediatype:texts',
        'x-archive-meta-collection:opensource',
        'x-archive-meta-title:%s' % (self.book,),
        'x-archive-meta-date:%s' % (now,),
        'x-archive-meta-creator:FLOSS Manuals Contributors',
        ]

    if self.license in config.LICENSES:
        headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

    # Upload via curl; its output goes to s3output for debugging.
    argv = ['curl', '--location', '-s', '-o', s3output]
    for h in headers:
        argv.extend(('--header', h))
    argv.extend(('--upload-file', self.publish_file, s3url,))

    log(' '.join(repr(x) for x in argv))
    check_call(argv, stdout=sys.stderr)
    self.notify_watcher()
    return detailsurl, s3url
def spawn_x(self):
    """Start an Xvfb instance, using a new server number. A
    reference to it is stored in self.xvfb, which is used to kill
    it when the pdf is done.

    Note that Xvfb doesn't interact well with dbus which is
    present on modern desktops.

    Fix: the docstring terminator and the closing '])' of the Popen
    argument list were truncated in this copy of the file; restored.
    """
    #Find an unused server number (in case two cgis are running at once)
    while True:
        servernum = random.randrange(50, 500)
        if not os.path.exists('/tmp/.X%s-lock' % servernum):
            break

    self.xserver_no = ':%s' % servernum

    authfile = self.filepath('Xauthority')
    os.environ['XAUTHORITY'] = authfile

    #mcookie(1) eats into /dev/random, so avoid that
    from hashlib import md5
    m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
    mcookie = m.hexdigest()

    check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

    self.xvfb = Popen(['Xvfb', self.xserver_no,
                       '-screen', '0', '1024x768x24',
                       '-pixdepths', '32',
                       #'-blackpixel', '0',
                       #'-whitepixel', str(2 ** 24 -1),
                       #'+extension', 'Composite',
                       '-dpi', '96',
                       #'-kb',
                       '-nolisten', 'tcp',
                       ])

    # We need to wait a bit before the Xvfb is ready. but the
    # downloads are so slow that that probably doesn't matter
    self.xvfb_ready_time = time.time() + 2

    os.environ['DISPLAY'] = self.xserver_no
    log(self.xserver_no)
969 def wait_for_xvfb(self):
970 """wait until a previously set time before continuing. This
971 is so Xvfb has time to properly start."""
972 if hasattr(self, 'xvfb'):
973 d = self.xvfb_ready_time - time.time()
974 if d > 0:
975 time.sleep(d)
976 self.notify_watcher()
    def cleanup_x(self):
        """Try very hard to kill off Xvfb. In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        # Nothing to do if spawn_x never ran.
        if not hasattr(self, 'xvfb'):
            return
        # Drop the xauth cookie added for this display by spawn_x.
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        # Polite first: SIGTERM, then poll for up to ~2 seconds.
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            # for-else: only reached if the poll loop never broke,
            # i.e. the process survived SIGTERM -- escalate to SIGKILL.
            log("Xvfb would not die! kill -9! kill -9!")
            try:
                os.kill(p.pid, 9)
            except OSError, e:
                # e.g. it died between the last poll and the kill -9
                log(e)
        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()
1005 def kill_old_processes(self):
1006 """Sometimes, despite everything, Xvfb or soffice instances
1007 hang around well after they are wanted -- for example if the
1008 cgi process dies particularly badly. So kill them if they have
1009 been running for a long time."""
1010 log("running kill_old_processes")
1011 killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1012 os.path.basename(config.HTML2ODT),
1013 os.path.basename(config.WKHTMLTOPDF),
1015 p = Popen(['ps', '-C', killable_names,
1016 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1017 data = p.communicate()[0].strip()
1018 if data:
1019 lines = data.split('\n')
1020 pids = []
1021 for line in lines:
1022 log('dealing with ps output "%s"' % line)
1023 try:
1024 pid, days, hours, minutes, seconds \
1025 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1026 except AttributeError:
1027 log("Couldn't parse that line!")
1028 # 50 minutes should be enough xvfb time for anyone
1029 if days or hours or int(minutes) > 50:
1030 pid = int(pid)
1031 log("going to kill pid %s" % pid)
1032 os.kill(pid, 15)
1033 pids.append(pid)
1035 time.sleep(1.0)
1036 for pid in pids:
1037 #try again in case any are lingerers
1038 try:
1039 os.kill(int(pid), 9)
1040 except OSError, e:
1041 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1042 continue
1043 log('killing %s with -9' % pid)
1044 self.notify_watcher()
1046 def cleanup(self):
1047 self.cleanup_x()
1048 if not config.KEEP_TEMP_FILES:
1049 for fn in os.listdir(self.workdir):
1050 os.remove(os.path.join(self.workdir, fn))
1051 os.rmdir(self.workdir)
1052 else:
1053 log("NOT removing '%s', containing the following files:" % self.workdir)
1054 log(*os.listdir(self.workdir))
1056 self.notify_watcher()
def use_cache():
    """Return True when this host is configured to always use the
    booki-zip cache."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
1062 def _read_cached_zip(server, book, max_age):
1063 #find a recent zip if possible
1064 prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1065 from glob import glob
1066 zips = sorted(glob(prefix + '*.zip'))
1067 if not zips:
1068 log("no cached booki-zips matching %s*.zip" % (prefix,))
1069 return None
1070 zipname = zips[-1]
1071 cutoff = time.time() - max_age * 60
1072 log(repr(zipname))
1073 try:
1074 date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1075 if date > cutoff:
1076 f = open(zipname)
1077 blob = f.read()
1078 f.close()
1079 return blob, zipname
1080 log("%s is too old, must reload" % zipname)
1081 return None
1082 except (IOError, IndexError, ValueError), e:
1083 log('could not make sense of %s: got exception %s' % (zipname, e))
1084 return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for a book from its server.

    Returns (blob, filename): blob is the raw zip data; filename is
    the local path it was saved to, or the passed-in/None value when
    save is false.  If max_age is non-zero (or this host always uses
    the cache), a cached zip no older than max_age minutes may be
    returned instead of fetching.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        # NOTE(review): HTTP_HOST is presumably a module-level name
        # defined earlier in this file -- confirm it is in scope.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book':book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # BUG FIX: zip data is binary; text mode ('w') corrupts it on
        # platforms that translate line endings.
        f = open(filename, 'wb')
        f.write(blob)
        f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    Returns a list of html strings.  Split markers (<hr/> elements
    with config.MARKER_CLASS_SPLIT) are inserted at roughly equal
    intervals, just before a '<', then split_tree divides the
    document there.
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    # one extra piece for each time either size limit is exceeded
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        # cut just before a tag so the marker can't land mid-element
        e = html.find('<', target * (i + 1))
        if e == -1:
            # no tag after this point: don't mangle the tail with a
            # marker (find() returning -1 would otherwise do so)
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]