Clean-ups leading from shift to separate htdocs directory
[objavi2.git] / objavi / fmbook.py
blob6aa1dc66076c2d4c52df7894183af3644bd8bb78
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 from subprocess import Popen, check_call, PIPE
28 from cStringIO import StringIO
29 from urllib2 import urlopen, HTTPError
30 import zipfile
31 import traceback
32 from string import ascii_letters
33 from pprint import pformat
35 try:
36 import simplejson as json
37 except ImportError:
38 import json
40 import lxml.html
41 from lxml import etree
43 from objavi import config, epub_utils
44 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
45 from objavi.book_utils import ObjaviError
46 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
47 from objavi.epub import add_guts, _find_tag
48 from objavi.xhtml_utils import EpubChapter, split_tree
50 from iarchive import epub as ia_epub
51 from booki.bookizip import get_metadata, add_metadata
# Absolute path of the configured scratch directory; each Book makes its
# own working directory inside it.
TMPDIR = os.path.abspath(config.TMPDIR)
# Document root and host name are taken from the process environment when
# set, falling back to the configured htdocs directory and an empty host.
DOC_ROOT = os.getenv('DOCUMENT_ROOT', config.HTDOCS)
HTTP_HOST = os.getenv('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the pair (s3url, detailsurl) of archive.org addresses
    for the book: the S3 address of the file `bookname` in the book's
    bucket, and the address of the book's details page."""
    urls = (
        'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname),
        'http://archive.org/details/booki-%s' % (bookid,),
    )
    return urls
62 def _get_best_title(tocpoint):
63 if 'html_title' in tocpoint:
64 return tocpoint['html_title']
65 if 'title' in tocpoint:
66 return tocpoint['title']
67 return 'Untitled'
def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e.

    The number goes in a new <strong class="initial"> first child; any
    text that used to lead e is moved to follow it, after a space.
    """
    leading_text = e.text
    number = e.makeelement("strong", Class="initial")
    number.text = "%s." % n
    e.insert(0, number)
    if leading_text is None:
        number.tail = ' '
    else:
        number.tail = ' ' + leading_text
    e.text = ''
def expand_toc(toc, depth=1, index=0):
    """Annotate every TOC item, in place, with convenience keys.

    Each item gains 'depth' (nesting level, starting from `depth`),
    'filename' and 'fragment' (its url, minus leading slashes, split at
    the first '#'; fragment is None when there is no '#') and 'index'
    (a running number in document order, starting from `index`).
    Children are processed recursively.  Returns the next unused index.
    """
    for item in toc:
        filename, hash_found, fragment = item['url'].lstrip('/').partition('#')
        item['depth'] = depth
        item["filename"] = filename
        item["fragment"] = fragment if hash_found else None
        item["index"] = index
        index += 1
        if 'children' in item:
            index = expand_toc(item['children'], depth + 1, index)
    return index
96 def _serialise(rtoc, stoc, depth):
97 for item in rtoc:
98 url = item['url'].lstrip('/')
99 bits = url.split('#', 1)
100 filename = bits[0]
101 fragment = (bits[1] if len(bits) == 2 else None)
102 stoc.append({"depth": depth,
103 "title": item['title'],
104 "url": url,
105 "filename": filename,
106 "fragment": fragment,
107 "type": item['type']
109 if 'children' in item:
110 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Flatten the recursive TOC structure into a list of serial
    points, each carrying its 'position' in that list along with the
    convenience keys added by _serialise."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    position = 0
    for point in stoc:
        point['position'] = position
        position += 1
    return stoc
def filename_toc_map(rtoc):
    """Map each filename to the list of TOC points that refer to it.

    The TOC is walked depth-first, parents before children, so each
    list keeps document order.  Points must already have a 'filename'
    key (expand_toc adds one).
    """
    tocmap = {}
    log(rtoc)
    # explicit stack of iterators instead of recursion
    pending = [iter(rtoc)]
    while pending:
        for point in pending[-1]:
            log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            if 'children' in point:
                # descend now; the parent iterator resumes afterwards
                pending.append(iter(point['children']))
                break
        else:
            pending.pop()
    return tocmap
def save_data(fn, data):
    """Save data to the file named fn without tripping up on unicode.

    Text (unicode) is encoded as UTF-8, ignoring anything that cannot
    be encoded; byte strings are written as they are.  The file is
    opened in binary mode so the bytes land on disk untranslated, and
    it is closed even if the write fails.
    """
    if not isinstance(data, bytes):
        data = data.encode('utf8', 'ignore')
    with open(fn, 'wb') as f:
        f.write(data)
143 class Book(object):
144 page_numbers = 'latin'
145 preamble_page_numbers = 'roman'
def notify_watcher(self, message=None):
    """Call every registered watcher with `message`.

    When no message is given, the name of the calling function is
    used.  Nothing happens (not even logging) if there are no watchers.
    """
    if not self.watchers:
        return
    if message is None:
        #message is the name of the caller
        message = traceback.extract_stack(None, 2)[0][2]
    log("notify_watcher called with '%s'" % message)
    for watcher in self.watchers:
        watcher(message)
156 def __enter__(self):
157 return self
def __exit__(self, exc_type, exc_value, tb):
    """Context-manager exit: send watchers the configured finished
    message, then clean up.

    The clean-up is in a `finally` clause so that a watcher raising
    during the final notification cannot prevent it from running.
    Nothing is returned, so an exception raised inside the `with`
    block is not suppressed.
    """
    try:
        self.notify_watcher(config.FINISHED_MESSAGE)
    finally:
        self.cleanup()
    #could deal with exceptions here and return true
def __init__(self, book, server, bookname,
             page_settings=None, watchers=None, isbn=None,
             license=config.DEFAULT_LICENSE, title=None,
             max_age=0):
    """Fetch the book's bookizip and prepare everything for processing.

    book, server -- identify the book to fetch (passed to fetch_zip).
    bookname -- name of the published file; also used as the prefix
        of the temporary working directory.
    page_settings -- optional dict of keyword arguments for
        PageSettings; self.maker is only set when this is given.
    watchers -- optional iterable of callables, each called with a
        message string as processing progresses.
    isbn, license, title -- when not None, patched into the book's
        metadata.
    max_age -- passed through to fetch_zip.

    If the zip cannot be fetched (HTTPError) the watchers are told
    and the process exits.  ObjaviError is raised if info.json lacks
    any of 'manifest', 'metadata', 'spine' or 'TOC'.
    """
    log("*** Starting new book %s ***" % bookname)
    # watchers must exist before the first notify_watcher call
    self.watchers = set()
    if watchers is not None:
        self.watchers.update(watchers)
    self.notify_watcher('start')
    self.bookname = bookname
    self.book = book
    self.server = server
    self.cookie = ''.join(random.sample(ascii_letters, 10))
    try:
        blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
    except HTTPError as e:
        traceback.print_exc()
        self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
        #not much to do?
        #raise 502 Bad Gateway ?
        sys.exit()
    zip_buffer = StringIO(blob)
    self.notify_watcher('fetch_zip')
    self.store = zipfile.ZipFile(zip_buffer, 'r')
    self.info = json.loads(self.store.read('info.json'))
    for k in ('manifest', 'metadata', 'spine', 'TOC'):
        if k not in self.info:
            raise ObjaviError('info.json of %s lacks vital element "%s"' %
                              (bookname, k))
        #check types also?

    self.metadata = self.info['metadata']
    self.spine = self.info['spine']
    self.manifest = self.info['manifest']

    def first_value(key, default, **kwargs):
        # first metadata value for key, or default if there is none
        return get_metadata(self.metadata, key, default=[default], **kwargs)[0]

    if server == config.LOCALHOST: # [DEPRECATED]
        server = first_value('server', server, ns=config.FM)
        book = first_value('book', book, ns=config.FM)

    log(pformat(self.metadata))
    self.lang = first_value('language', None)
    if not self.lang:
        self.lang = guess_lang(server, book)
        log('guessed lang as %s' % self.lang)

    self.toc_header = first_value('toc_header', None, ns=config.FM)
    if not self.toc_header:
        self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

    self.dir = first_value('dir', None, ns=config.FM)
    if not self.dir:
        self.dir = guess_text_dir(server, book)

    #Patch in the extra metadata. (lang and dir may be set from config)
    #these should be read from zip -- so should go into zip?
    extra_metadata = (
        (isbn, 'id', 'ISBN', config.DC),
        (license, 'rights', 'License', config.DC),
        (title, 'title', '', config.DC),
        (self.lang, 'language', '', config.DC),
        (self.dir, 'dir', '', config.FM),
    )
    for var, key, scheme, ns in extra_metadata:
        if var is not None:
            add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

    self.isbn = first_value('id', None, scheme='ISBN')
    self.license = first_value('rights', None, scheme='License')

    self.toc = self.info['TOC']
    expand_toc(self.toc)

    self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
    os.chmod(self.workdir, 0o755)

    # intermediate files all live in the working directory
    for attr, fn in (('body_html_file', 'body.html'),
                     ('body_pdf_file', 'body.pdf'),
                     ('preamble_html_file', 'preamble.html'),
                     ('preamble_pdf_file', 'preamble.pdf'),
                     ('tail_html_file', 'tail.html'),
                     ('tail_pdf_file', 'tail.pdf'),
                     ('pdf_file', 'final.pdf'),
                     ('body_odt_file', 'body.odt'),
                     ):
        setattr(self, attr, self.filepath(fn))
    self.isbn_pdf_file = None

    self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))

    if page_settings is not None:
        self.maker = PageSettings(**page_settings)

    if title is not None:
        self.title = title
    else:
        titles = get_metadata(self.metadata, 'title')
        if titles:
            self.title = titles[0]
        else:
            self.title = 'A Book About ' + self.book

    self.notify_watcher()
if config.TRY_BOOK_CLEANUP_ON_DEL:
    #Dont even define __del__ if it is not used.
    _try_cleanup_on_del = True
    def __del__(self):
        """Last-chance clean-up when the book is garbage collected
        while its working directory still exists."""
        # __init__ can fail before self.workdir is assigned; there is
        # nothing to clean up in that case.
        workdir = getattr(self, 'workdir', None)
        if (self._try_cleanup_on_del and workdir is not None
            and os.path.exists(workdir)):
            self._try_cleanup_on_del = False #or else you can get in bad cycles
            self.cleanup()
def get_tree_by_id(self, id):
    """Get the content of the manifest item with the given ID.

    'text/html' items are parsed with lxml.html (an unparseable one
    is logged and replaced with an empty document); other items whose
    mimetype contains 'xml' are parsed with etree; anything else is
    returned as the raw string read from the zip.
    """
    name = self.manifest[id]['url']
    mimetype = self.manifest[id]['mimetype']
    s = self.store.read(name)
    f = StringIO(s)
    is_html = (mimetype == 'text/html')
    if is_html:
        try:
            tree = lxml.html.parse(f)
        except etree.XMLSyntaxError as e:
            log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                (id, name, s[:20], e))
            empty = lxml.html.document_fromstring('<html><body></body></html>')
            tree = empty.getroottree()
    elif 'xml' in mimetype: #XXX or is this just asking for trouble?
        tree = etree.parse(f)
    else:
        tree = f.read()
    f.close()
    return tree
295 def filepath(self, fn):
296 return os.path.join(self.workdir, fn)
def save_tempfile(self, fn, data):
    """Save the data under the name fn in the book's temporary
    working directory, which is cleaned up when all is done.
    Return the path of the saved file."""
    path = self.filepath(fn)
    save_data(path, data)
    return path
def make_oo_doc(self):
    """Make an openoffice document, using the html2odt script, and
    move it to the publish location."""
    self.wait_for_xvfb()
    save_data(self.body_html_file,
              etree.tostring(self.tree, method="html"))
    command = [config.HTML2ODT, self.workdir,
               self.body_html_file, self.body_odt_file]
    run(command)
    log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
    os.rename(self.body_odt_file, self.publish_file)
    self.notify_watcher()
def extract_pdf_outline(self):
    """Read the outline of the body pdf into self.outline_contents
    and self.outline_text, and return the page count that
    parse_outline reports.

    If the outline comes back empty, the pdf is remade from a copy of
    the tree whose h1-h4 headings are replaced by ascii keys
    ('h1_0', 'h2_3', ...), and the real titles are looked up from
    those keys in the outline of that second pdf.
    """
    debugf = self.filepath('outline.txt')
    (self.outline_contents,
     self.outline_text,
     number_of_pages) = parse_outline(self.body_pdf_file, 1, debugf)

    if self.outline_contents:
        for x in self.outline_contents:
            log(x)
    else:
        #probably problems with international text. need a horrible hack
        log('no outline: trying again with ascii headings')
        import copy
        tree = copy.deepcopy(self.tree)
        titlemap = {}
        for tag in ('h1', 'h2', 'h3', 'h4'):
            for i, e in enumerate(tree.getiterator(tag)):
                key = "%s_%s" % (tag, i)
                titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                del e[:]
                if tag == 'h1':
                    # h1 keys are wrapped in <strong class="initial">
                    e = lxml.etree.SubElement(e, "strong", Class="initial")
                e.text = key
                log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

        ascii_html_file = self.filepath('body-ascii-headings.html')
        ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
        save_data(ascii_html_file, lxml.etree.tostring(tree, method="html"))
        self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
        debugf = self.filepath('ascii_outline.txt')
        ascii_contents, ascii_text, number_of_ascii_pages = \
            parse_outline(ascii_pdf_file, 1, debugf)
        self.outline_contents = []
        log ("number of pages: %s, post ascii: %s" %
             (number_of_pages, number_of_ascii_pages))
        for ascii_title, depth, pageno in ascii_contents:
            if ascii_title.endswith('&#0;'): #stupid [something] puts this in
                ascii_title = ascii_title[:-4]
            if ' ' in ascii_title:
                # keep only the last word, which is the key
                ascii_title = ascii_title.rsplit(' ', 1)[1]
            title = titlemap.get(ascii_title, '')
            log((ascii_title, title, depth, pageno))
            self.outline_contents.append((title, depth, pageno))

    self.notify_watcher()
    return number_of_pages
def make_body_pdf(self):
    """Make a pdf of the HTML, using webkit"""
    maker = self.maker
    body_pdf = self.body_pdf_file

    #1. Save the html
    save_data(self.body_html_file,
              etree.tostring(self.tree, method="html"))

    #2. Make a pdf of it
    maker.make_raw_pdf(self.body_html_file, body_pdf, outline=True)
    self.notify_watcher('generate_pdf')

    #3. Read the outline, which also gives the page count
    n_pages = self.extract_pdf_outline()
    log ("found %s pages in pdf" % n_pages)

    #4. resize pages, shift gutters, even pages
    maker.reshape_pdf(body_pdf, self.dir, centre_end=True)
    self.notify_watcher('reshape_pdf')

    #5 add page numbers
    maker.number_pdf(body_pdf, n_pages, dir=self.dir,
                     numbers=self.page_numbers)
    self.notify_watcher("number_pdf")
    self.notify_watcher()
def make_preamble_pdf(self):
    """Make the preamble pdf: front page title, inside cover and
    table of contents, reshaped and numbered with the preamble
    numbering style."""
    contents = self.make_contents()
    inside_cover_html = self.compose_inside_cover()
    log(self.dir, self.css_url, self.title, inside_cover_html,
        self.toc_header, contents, self.title)

    page_template = ('<html dir="%s"><head>\n'
                     '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                     '<link rel="stylesheet" href="%s" />\n'
                     '</head>\n<body>\n'
                     '<h1 class="frontpage">%s</h1>'
                     '%s\n'
                     '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                     '<div style="page-break-after: always; color:#fff" class="unseen">.'
                     '<!--%s--></div></body></html>'
                     )
    values = (self.dir, self.css_url, self.title,
              inside_cover_html.decode('utf-8'),
              self.toc_header, contents, self.title)
    save_data(self.preamble_html_file, page_template % values)

    maker = self.maker
    preamble_pdf = self.preamble_pdf_file
    maker.make_raw_pdf(self.preamble_html_file, preamble_pdf)
    maker.reshape_pdf(preamble_pdf, self.dir, centre_start=True)
    maker.number_pdf(preamble_pdf, None, dir=self.dir,
                     numbers=self.preamble_page_numbers,
                     number_start=-2)

    self.notify_watcher()
def make_end_matter_pdf(self):
    """Make an inside back cover and a back cover.  If there is an
    isbn number its barcode will be put on the back cover."""
    maker = self.maker
    if self.isbn:
        self.isbn_pdf_file = self.filepath('isbn.pdf')
        maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
        self.notify_watcher('make_barcode_pdf')

    end_matter = self.compose_end_matter()
    log(end_matter)
    save_data(self.tail_html_file, end_matter.decode('utf-8'))

    tail_pdf = self.tail_pdf_file
    maker.make_raw_pdf(self.tail_html_file, tail_pdf)
    maker.reshape_pdf(tail_pdf, self.dir, centre_start=True,
                      centre_end=True, even_pages=False)
    self.notify_watcher()
def make_book_pdf(self):
    """A convenient wrapper of a few necessary steps: make the body,
    preamble and end matter pdfs, then join them into the final pdf."""
    # now the Xvfb server is needed. make sure it has had long enough to get going
    self.wait_for_xvfb()
    for make_part in (self.make_body_pdf,
                      self.make_preamble_pdf,
                      self.make_end_matter_pdf):
        make_part()

    # isbn_pdf_file is None when the book has no isbn
    parts = (self.preamble_pdf_file, self.body_pdf_file,
             self.tail_pdf_file, self.isbn_pdf_file)
    concat_pdfs(self.pdf_file, *parts)

    self.notify_watcher('concatenated_pdfs')
def make_simple_pdf(self, mode):
    """Make a simple pdf document without contents or separate
    title page.  This is used for multicolumn newspapers and for
    web-destined pdfs.

    mode -- 'web' sets the gutter to zero and skips the reshaping and
        page numbering steps.
    """
    self.wait_for_xvfb()
    for_web = (mode == 'web')

    #0. Add heading to beginning of html
    body = list(self.tree.cssselect('body'))[0]
    title_heading = body.makeelement('h1', {'id': 'book-title'})
    title_heading.text = self.title
    body.insert(0, title_heading)
    intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
    title_heading.addnext(intro)

    #0.5 adjust parameters to suit the particular kind of output
    if for_web:
        self.maker.gutter = 0

    #1. Save the html
    save_data(self.body_html_file,
              etree.tostring(self.tree, method="html"))

    #2. Make a pdf of it (direct to final pdf)
    self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
    self.notify_watcher('generate_pdf')
    n_pages = count_pdf_pages(self.pdf_file)

    if not for_web:
        #3. resize pages and shift gutters.
        self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #4. add page numbers
        self.maker.number_pdf(self.pdf_file, n_pages,
                              dir=self.dir, numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
    self.notify_watcher()
def rotate180(self):
    """Rotate the pdf 180 degrees so an RTL book can print on LTR
    presses.  The unrotated pdf is kept as 'final-pre-rotate.pdf'."""
    rotated = self.filepath('final-rotate.pdf')
    unrotated = self.filepath('final-pre-rotate.pdf')
    #leave the unrotated pdf intact at first, in case of error.
    rotate_pdf(self.pdf_file, rotated)
    for src, dest in ((self.pdf_file, unrotated),
                      (rotated, self.pdf_file)):
        os.rename(src, dest)
    self.notify_watcher()
def publish_pdf(self):
    """Move the finished PDF to its final resting place"""
    src, dest = self.pdf_file, self.publish_file
    log("Publishing %r as %r" % (src, dest))
    os.rename(src, dest)
    self.notify_watcher()
def publish_bookizip(self):
    """Publish the bookizip.  For this, copy rather than move,
    because the bookizip might be used by further processing.  A
    hard link (cp -l) is tried first; a plain copy is the fallback
    if that raises OSError."""
    src, dest = self.bookizip_file, self.publish_file
    log("Publishing %r as %r" % (src, dest))
    try:
        run(['cp', '-l', src, dest])
    except OSError:
        run(['cp', src, dest])
    self.notify_watcher()
def concat_html(self):
    """Join all the chapters together into one tree.  Keep the TOC
    up-to-date along the way.

    Each spine item is parsed and appended to a fresh document.  Every
    TOC point that refers to the item's file is given an 'html_id':
    the id of the element the point links to in the joined document.
    A point may also gain an 'html_title' taken from the chapter's
    leading heading.  Items that cannot be parsed are logged and
    skipped.  Returns the joined document.
    """
    #each manifest item looks like:
    #{'contributors': []
    #'license': [],
    #'mimetype': '',
    #'rightsholders': []
    #'url': ''}
    doc = lxml.html.document_fromstring('<html><body></body></html>')
    tocmap = filename_toc_map(self.toc)
    for ID in self.spine:
        details = self.manifest[ID]
        log(ID, pformat(details))
        # changed by Aco
        try:
            root = self.get_tree_by_id(ID).getroot()
        except Exception as e:
            log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
            continue
        #handle any TOC points in this file.  A spine file that no TOC
        #point refers to has no entry in tocmap; it is still added to
        #the document below, rather than aborting with a KeyError.
        for point in tocmap.get(details['url'], []):
            #if the url has a #identifier, use it. Otherwise, make
            #one up, using a hidden element at the beginning of
            #the inserted document.
            #XXX this will break if different files use the same ids
            #XXX should either replace all, or replace selectively.
            if point['fragment']:
                fragment = point['fragment']
            else:
                body = _find_tag(root, 'body')
                fragment = '%s_%s' % (self.cookie, point['index'])
                #reuse first tag if it is suitable.
                if (len(body) and
                    body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                    if body[0].get('id') is None:
                        body[0].set('id', fragment)
                    else:
                        fragment = body[0].get('id')
                    #the chapter starts with a heading. that heading should be the chapter name.
                    if body[0].tag in ('h1', 'h2', 'h3'):
                        log('chapter has title "%s", found html title "%s"' %
                            (point['title'], body[0].text_content()))
                        point['html_title'] = body[0].text_content()
                else:
                    marker = body.makeelement('div', style="display:none",
                                              id=fragment)
                    body.insert(0, marker)
            point['html_id'] = fragment

        add_guts(root, doc)
    return doc
568 def unpack_static(self):
569 """Extract static files from the zip for the html to refer to."""
570 static_files = [x['url'] for x in self.manifest.values()
571 if x['url'].startswith('static')]
572 if static_files:
573 os.mkdir(self.filepath('static'))
575 for name in static_files:
576 s = self.store.read(name)
577 f = open(self.filepath(name), 'w')
578 f.write(s)
579 f.close()
580 self.notify_watcher()
def load_book(self):
    """Unpack the static files, join the chapters into self.tree
    (saving a copy as raw.html), and collect the h1 elements in
    self.headings.  The first heading gets the class
    'first-heading'."""
    #XXX concatenate the HTML to match how TWiki version worked.
    # This is perhaps foolishly early -- throwing away useful boundaries.
    self.unpack_static()
    self.tree = self.concat_html()
    raw_html = etree.tostring(self.tree, method='html')
    self.save_tempfile('raw.html', raw_html)

    self.headings = list(self.tree.cssselect('h1'))
    if self.headings:
        first = self.headings[0]
        first.set('class', "first-heading")
    for h1 in self.headings:
        h1.title = h1.text_content().strip()
    self.notify_watcher()
597 def make_contents(self):
598 """Generate HTML containing the table of contents. This can
599 only be done after the main PDF has been made, because the
600 page numbers are contained in the PDF outline."""
601 header = '<h1>Table of Contents</h1><table class="toc">\n'
602 row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
603 '<td class="pagenumber">%s</td></tr>\n')
604 empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
605 section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
606 footer = '\n</table>'
608 contents = []
610 chapter = 1
611 page_num = 1
613 outline_contents = iter(self.outline_contents)
615 for section in self.toc:
616 if not section.get('children'):
617 contents.append(empty_section_tmpl % section['title'])
618 continue
619 contents.append(section_tmpl % section['title'])
621 for point in section['children']:
622 try:
623 h1_text, level, page_num = outline_contents.next()
624 except StopIteration:
625 log("contents data not found for %s. Stopping" % (point,))
626 break
627 contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
628 chapter += 1
630 doc = header + '\n'.join(contents) + footer
631 self.notify_watcher()
632 return doc
def add_section_titles(self):
    """Add any section heading pages that the TOC.txt file
    specifies.  These are sub-book, super-chapter groupings.

    Also add initial numbers to chapters.
    """
    chapter = 1
    section = None
    log(self.toc)
    for t in self.toc:
        #only top level sections get a subsection page,
        #and only if they have children.
        if not t.get('children'):
            continue
        section = self.tree.makeelement('div', Class="objavi-subsection")
        section_heading = etree.SubElement(section, 'div',
                                           Class="objavi-subsection-heading")
        section_heading.text = t['title']
        for child in t['children']:
            item = etree.SubElement(section, 'div', Class="objavi-chapter")
            if 'html_title' in child:
                item.text = child['html_title']
                # number the chapter's own heading in the body too
                targets = self.tree.cssselect('#'+ child['html_id'])
                if targets:
                    _add_initial_number(targets[0], chapter)
            else:
                item.text = child['title']
            _add_initial_number(item, chapter)
            log(item.text, debug='HTMLGEN')
            chapter += 1
        log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
        # the section page goes immediately before the section's anchor
        location = self.tree.cssselect('#'+ t['html_id'])[0]
        location.addprevious(section)

    self.notify_watcher()
def add_css(self, css=None, mode='book'):
    """Link a stylesheet into the head of the book's html tree.

    css -- if it looks like an http or https url, it is used as the
        stylesheet link.  If it is None or blank, a default
        stylesheet is chosen: the server's configured default for
        the mode, or failing that one based on the book's language.
        Otherwise it is the CSS itself, which is saved to a temporary
        file and linked to.
    mode -- selects which default stylesheet to use.

    The url is stored in self.css_url and returned.
    """
    log("css is %r" % css)
    htmltree = self.tree
    if css is None or not css.strip():
        css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
        if css_default is None:
            #guess from language -- this should come first
            css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                config.LANGUAGE_CSS['en'])
            css_default = css_modes.get(mode, css_modes[None])
        url = 'file://' + os.path.abspath(css_default)
    elif not re.match(r'^https?://\S+$', css):
        # not a url (https counts as one too), so it is literal CSS
        fn = self.save_tempfile('objavi.css', css)
        url = 'file://' + fn
    else:
        url = css
    #XXX for debugging and perhaps sensible anyway
    #url = url.replace('file:///home/douglas/objavi2', '')

    #find the head -- it's probably first child but lets not assume.
    for child in htmltree:
        if child.tag == 'head':
            head = child
            break
    else:
        head = htmltree.makeelement('head')
        htmltree.insert(0, head)

    etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
    self.css_url = url
    self.notify_watcher()
    return url
708 def _read_localised_template(self, template, fallbacks=['en']):
709 """Try to get the template in the approriate language, otherwise in english."""
710 for lang in [self.lang] + fallbacks:
711 try:
712 fn = template % (lang)
713 f = open(fn)
714 break
715 except IOError, e:
716 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
717 log(e)
718 template = f.read()
719 f.close()
720 return template
def compose_inside_cover(self):
    """create the markup for the preamble inside cover, by filling
    today's date, the isbn (if any) and the license into the
    localised template."""
    template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

    isbn_text = ''
    if self.isbn:
        isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn

    fields = {'date': time.strftime('%Y-%m-%d'),
              'isbn': isbn_text,
              'license': self.license,
              }
    return template % fields
def compose_end_matter(self):
    """create the markup for the end_matter inside cover.  If
    self.isbn is not set, the html will result in a pdf that
    spills onto two pages.
    """
    template = self._read_localised_template(config.END_MATTER_TEMPLATE)

    # without an isbn, force a page break after the inside cover
    if self.isbn:
        inside_cover_style = ''
    else:
        inside_cover_style = 'page-break-after: always'

    d = {'css_url': self.css_url,
         'title': self.title,
         'inside_cover_style': inside_cover_style,
         }
    return template % d
def make_epub(self, use_cache=False):
    """Make an epub version of the book, using Mike McCabe's
    epub module for the Internet Archive.

    Every manifest item is added to the epub.  HTML items are
    converted to xhtml and may be split into several files, in which
    case all the pieces take the original's place in the spine.  The
    book's metadata is written into content.opf.

    use_cache -- passed on to EpubChapter.
    """
    ebook = ia_epub.Book(self.publish_file, content_dir='')
    def add_file(ID, filename, mediatype, content):
        ebook.add_content({'media-type': mediatype.encode('utf-8'),
                           'id': ID.encode('utf-8'),
                           'href': filename.encode('utf-8'),
                           }, content)

    toc = self.info['TOC']

    #manifest
    filemap = {} #map html to corresponding xhtml
    spinemap = {} #map IDs to multi-file chapters
    for ID in self.manifest:
        details = self.manifest[ID]
        log(ID, pformat(details))
        fn, mediatype = details['url'], details['mimetype']
        content = self.store.read(fn)
        if mediatype == 'text/html':
            #convert to application/xhtml+xml, and perhaps split
            c = EpubChapter(self.server, self.book, ID, content,
                            use_cache=use_cache)
            c.remove_bad_tags()
            if fn.endswith('.html'):
                fnbase = fn[:-5]
            else:
                fnbase = fn
            fnx = fnbase + '.xhtml'
            mediatype = 'application/xhtml+xml'

            # NOTE(review): the imports at the top of this file bring in
            # split_tree, not split_html -- confirm split_html is
            # defined somewhere this module can see it.
            fragments = split_html(c.as_xhtml(),
                                   compressed_size=self.store.getinfo(fn).compress_size)

            #add the first one as if it is the whole thing (as it often is)
            add_file(ID, fnx, mediatype, fragments[0])
            filemap[fn] = fnx
            if len(fragments) > 1:
                spine_ids = [ID]
                spinemap[ID] = spine_ids
                #add any extras
                for i in range(1, len(fragments)):
                    # XXX it is possible for duplicates if another
                    # file happens to have this name. Ignore for now
                    _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                    spine_ids.append(_id)
                    add_file(_id,
                             '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                             mediatype, fragments[i])

        else:
            add_file(ID, fn, mediatype, content)

    #toc
    ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
    ebook.add(ebook.content_dir + 'toc.ncx', ncx)

    #spine
    for ID in self.spine:
        if ID in spinemap:
            for x in spinemap[ID]:
                ebook.add_spine_item({'idref': x})
        else:
            ebook.add_spine_item({'idref': ID})

    #metadata -- no use of attributes (yet)
    # and fm: metadata disappears for now
    DCNS = config.DCNS
    DC = config.DC
    meta_info_items = []
    for ns, namespace in self.metadata.items():
        for keyword, schemes in namespace.items():
            if ns:
                keyword = '{%s}%s' % (ns, keyword)
            for scheme, values in schemes.items():
                for value in values:
                    item = {
                        'item': keyword,
                        'text': value,
                        }
                    if scheme:
                        if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                            item['atts'] = {'role': scheme}
                        else:
                            item['atts'] = {'scheme': scheme}
                    # without this the item built above was thrown away
                    # and none of the book's metadata reached the opf
                    meta_info_items.append(item)

    dc_metadata = self.metadata.get(DC, {})
    has_authors = 'creator' in dc_metadata
    if not has_authors and config.CLAIM_UNAUTHORED:
        # There is no 'creator' entry in this branch, so looking one up
        # could only raise KeyError.
        # NOTE(review): naming the dc contributors in the copyright line
        # is an assumption about the intent -- confirm.
        authors = []
        for x in dc_metadata.get('contributor', {}).values():
            authors.extend(x)
        if not authors:
            authors = ['The Contributors']

        meta_info_items.append({'item': DCNS + 'creator',
                                'text': 'The Contributors'})

        meta_info_items.append({'item': DCNS + 'rights',
                                'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                               )

    tree_str = ia_epub.make_opf(meta_info_items,
                                ebook.manifest_items,
                                ebook.spine_items,
                                ebook.guide_items,
                                ebook.cover_id)
    ebook.add(ebook.content_dir + 'content.opf', tree_str)
    ebook.z.close()
    self.notify_watcher()
def publish_s3(self):
    """Push the book's epub to archive.org, using S3.

    The access key and secret are read from the files named by
    config.S3_ACCESSKEY and config.S3_SECRET, and the published file
    is uploaded with curl.  The credentials are kept out of the log.

    Returns the pair (detailsurl, s3url).  check_call raises
    CalledProcessError if curl exits with an error.
    """
    #XXX why only epub?
    secrets = {}
    for x in ('S3_SECRET', 'S3_ACCESSKEY'):
        fn = getattr(config, x)
        with open(fn) as f:
            secrets[x] = f.read().strip()

    now = time.strftime('%F')
    s3output = self.filepath('s3-output.txt')
    s3url, detailsurl = find_archive_urls(self.book, self.bookname)
    auth_header = "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets
    headers = [
        'x-amz-auto-make-bucket:1',
        auth_header,
        'x-archive-meta-mediatype:texts',
        'x-archive-meta-collection:opensource',
        'x-archive-meta-title:%s' % (self.book,),
        'x-archive-meta-date:%s' % (now,),
        'x-archive-meta-creator:FLOSS Manuals Contributors',
        ]

    if self.license in config.LICENSES:
        headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

    argv = ['curl', '--location', '-s', '-o', s3output]
    for h in headers:
        argv.extend(('--header', h))
    argv.extend(('--upload-file', self.publish_file, s3url,))

    # log the command line, but with the credentials masked
    loggable = []
    for x in argv:
        if x == auth_header:
            x = 'authorization: LOW <hidden>'
        loggable.append(repr(x))
    log(' '.join(loggable))
    check_call(argv, stdout=sys.stderr)
    self.notify_watcher()
    return detailsurl, s3url
def spawn_x(self):
    """Start an Xvfb instance, using a new server number.  A
    reference to it is stored in self.xvfb, which is used to kill
    it when the pdf is done.

    Note that Xvfb doesn't interact well with dbus which is
    present on modern desktops.
    """
    #Find an unused server number (in case two cgis are running at once)
    servernum = random.randrange(50, 500)
    while os.path.exists('/tmp/.X%s-lock' % servernum):
        servernum = random.randrange(50, 500)

    self.xserver_no = ':%s' % servernum

    authfile = self.filepath('Xauthority')
    os.environ['XAUTHORITY'] = authfile

    #mcookie(1) eats into /dev/random, so avoid that
    from hashlib import md5
    seed = "%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32))
    mcookie = md5(seed).hexdigest()

    check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

    xvfb_command = ['Xvfb', self.xserver_no,
                    '-screen', '0', '1024x768x24',
                    '-pixdepths', '32',
                    #'-blackpixel', '0',
                    #'-whitepixel', str(2 ** 24 -1),
                    #'+extension', 'Composite',
                    '-dpi', '96',
                    #'-kb',
                    '-nolisten', 'tcp',
                    ]
    self.xvfb = Popen(xvfb_command)

    # We need to wait a bit before the Xvfb is ready.  but the
    # downloads are so slow that that probably doesn't matter

    self.xvfb_ready_time = time.time() + 2

    os.environ['DISPLAY'] = self.xserver_no
    log(self.xserver_no)
949 def wait_for_xvfb(self):
950 """wait until a previously set time before continuing. This
951 is so Xvfb has time to properly start."""
952 if hasattr(self, 'xvfb'):
953 d = self.xvfb_ready_time - time.time()
954 if d > 0:
955 time.sleep(d)
956 self.notify_watcher()
def cleanup_x(self):
    """Try very hard to kill off Xvfb.

    Returns immediately if this instance has no self.xvfb.  Otherwise
    the xauth entry for self.xserver_no is removed, the Xvfb process
    is sent SIGTERM, polled up to ten times at 0.2 second intervals,
    and sent SIGKILL if it is still running after that.

    In addition to killing this instance's xvfb, occasionally
    (randomly, about one call in ten) search for escaped Xvfb
    instances and kill those too.
    """
    if not hasattr(self, 'xvfb'):
        return

    # A failure to tidy the xauth entry must not stop us from killing
    # the server itself, so it is logged rather than propagated.
    from subprocess import CalledProcessError
    try:
        check_call(['xauth', 'remove', self.xserver_no])
    except (OSError, CalledProcessError):
        log("could not remove xauth entry for %s: %s"
            % (self.xserver_no, traceback.format_exc()))

    p = self.xvfb
    log("trying to kill Xvfb %s" % p.pid)
    os.kill(p.pid, 15)
    for i in range(10):
        if p.poll() is not None:
            log("%s died with %s" % (p.pid, p.poll()))
            break
        log("%s not dead yet" % p.pid)
        time.sleep(0.2)
    else:
        # never broke out of the loop: SIGTERM was not enough
        log("Xvfb would not die! kill -9! kill -9!")
        os.kill(p.pid, 9)

    if random.random() < 0.1:
        # occasionally kill old xvfbs and soffices, if there are any.
        self.kill_old_processes()
def kill_old_processes(self):
    """Sometimes, despite everything, Xvfb or soffice instances hang
    around well after they are wanted -- for example if the cgi
    process dies particularly badly.  So kill them if they have been
    running for a long time.

    Candidates are found with `ps -C`, matching Xvfb, the soffice
    variants and the basenames of config.HTML2ODT and
    config.WKHTMLTOPDF.  Anything whose elapsed time shows days,
    hours, or more than 50 minutes is sent SIGTERM and, a second
    later, SIGKILL.  Lines of ps output that cannot be parsed, and
    processes that cannot be signalled, are logged and skipped.
    """
    log("running kill_old_processes")
    killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
                               os.path.basename(config.HTML2ODT),
                               os.path.basename(config.WKHTMLTOPDF),
                               ])
    p = Popen(['ps', '-C', killable_names,
               '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
    data = p.communicate()[0].strip()
    if data:
        pids = []
        for line in data.split('\n'):
            log('dealing with ps output "%s"' % line)
            # etime looks like [[dd-]hh:]mm:ss
            m = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line)
            if m is None:
                log("Couldn't parse that line!")
                # Skip it: falling through would test values left
                # over from the previous line (or not set at all).
                continue
            pid, days, hours, minutes, seconds = m.groups()
            # 50 minutes should be enough xvfb time for anyone
            if days or hours or int(minutes) > 50:
                pid = int(pid)
                log("going to kill pid %s" % pid)
                try:
                    os.kill(pid, 15)
                except OSError as e:
                    # already gone, or not ours to kill; either way
                    # carry on with the rest of the list
                    log("couldn't signal pid %s (%s)" % (pid, e))
                    continue
                pids.append(pid)

        if pids:
            # give the SIGTERMs a moment to work
            time.sleep(1.0)
        for pid in pids:
            #try again in case any are lingerers
            try:
                os.kill(int(pid), 9)
            except OSError as e:
                log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                continue
            log('killing %s with -9' % pid)
    self.notify_watcher()
def cleanup(self):
    """Shut down Xvfb and dispose of the working directory.

    When config.KEEP_TEMP_FILES is set the directory is left alone
    and its contents are logged; otherwise every entry in
    self.workdir is removed, followed by the directory itself.
    """
    self.cleanup_x()
    if config.KEEP_TEMP_FILES:
        log("NOT removing '%s', containing the following files:" % self.workdir)
        log(*os.listdir(self.workdir))
    else:
        for entry in os.listdir(self.workdir):
            doomed = os.path.join(self.workdir, entry)
            os.remove(doomed)
        os.rmdir(self.workdir)

    self.notify_watcher()
def use_cache():
    """Say whether the HTTP_HOST environment variable names one of
    the hosts in config.USE_ZIP_CACHE_ALWAYS_HOSTS."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
def _read_cached_zip(server, book, max_age):
    """Return the newest cached booki-zip for a book, if recent enough.

    Cached zips are looked for in config.BOOKI_BOOK_DIR under the
    name make_book_name() gives, cut off at the first '-20'
    (presumably where the timestamp starts -- confirm against
    make_book_name).  The last match in sorted order is taken and its
    age is read from the '-%Y.%m.%d-%H.%M.%S.zip' suffix of its name.

    max_age is in minutes.

    Returns a (blob, zipname) tuple, or None if there is no matching
    file, the newest one is older than max_age, or its name or
    contents could not be read.
    """
    from glob import glob
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR,
                        make_book_name(book, server, '').split('-20', 1)[0])
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        # The prefix is literal text in the strptime format, so any
        # '%' in it has to be doubled.
        date_format = prefix.replace('%', '%%') + '-%Y.%m.%d-%H.%M.%S.zip'
        date = time.mktime(time.strptime(zipname, date_format))
        if date <= cutoff:
            log("%s is too old, must reload" % zipname)
            return None
        # zips are binary data; always give the handle back
        f = open(zipname, 'rb')
        try:
            blob = f.read()
        finally:
            f.close()
        return blob, zipname
    except (IOError, IndexError, ValueError) as e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Get the booki-zip for a book, from the cache or over the network.

    server and book select the zip; the URL template comes from
    config.ZIP_URLS, keyed by the server's 'interface' setting in
    config.SERVER_DEFAULTS (default 'Booki').

    max_age is the oldest acceptable cached copy, in minutes.  Zero
    disables the cache.  A negative value means "no preference": it
    becomes 12 hours where use_cache() is true and disables the
    cache elsewhere.

    If save is true, a freshly fetched zip is written to filename,
    or to a make_book_name() name in config.BOOKI_BOOK_DIR when
    filename is None.

    Returns a (blob, filename) tuple.  filename is the cached file's
    name on a cache hit, and may be None for an unsaved fetch.

    Raises NotImplementedError if there is no URL template for the
    server's interface.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book':book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    # Only a positive age can match anything: a negative one puts the
    # cutoff in the future, so looking would just log a misleading
    # warning and then miss.
    if max_age > 0:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # zips are binary data
        f = open(filename, 'wb')
        try:
            f.write(blob)
        finally:
            f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    html is the document as a string.  compressed_size is its zlib
    compressed length, which is calculated here if not given.  If
    fix_markup is true the html is round-tripped through lxml before
    splitting.

    The number of splits is set by whichever of
    config.EPUB_COMPRESSED_SIZE_MAX and config.EPUB_FILE_SIZE_MAX is
    exceeded by the larger factor.  Marker <hr> elements are dropped
    in at roughly even intervals, always just before a '<', and
    split_tree() breaks the parsed document at them.

    Returns a list of html strings; just [html] if no split is
    needed.
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        if e == -1:
            # No '<' left to split at.  Used as a slice index, -1
            # would put this marker (and all later ones) just before
            # the last character, so stop adding markers instead.
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]