attach bookname to self for s3 publishing
[objavi2.git] / objavi / fmbook.py
blob5471f30403de9264f828a09371ff7f2c1466bfb3
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 from subprocess import Popen, check_call, PIPE
28 from cStringIO import StringIO
29 import zipfile
30 import traceback
31 try:
32 import simplejson as json
33 except ImportError:
34 import json
36 import lxml, lxml.html, lxml.etree
38 from objavi import config, twiki_wrapper, epub_utils
39 from objavi.cgi_utils import log, run, shift_file
40 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
42 from iarchive import epub as ia_epub
43 from booki.xhtml_utils import EpubChapter
# Absolute path of the scratch directory; each Book makes its own
# working directory underneath it.
TMPDIR = os.path.abspath(config.TMPDIR)

# Values taken from the (CGI) process environment, with fallbacks for
# when the variables are not set.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
HTTP_HOST = os.environ.get('HTTP_HOST', '')

# Directory that finished books are moved into.
PUBLISH_PATH = DOC_ROOT + "/books/"
def make_book_name(book, server, suffix='.pdf'):
    """Build a timestamped file name for a book.

    The name has the form <book>-<lang>-<timestamp><suffix>, where
    <book> is the book name with every non-alphanumeric character
    removed and <lang> is the language configured for the server (or
    for the default server, if this server is not configured).
    """
    fallback = config.SERVER_DEFAULTS[config.DEFAULT_SERVER]
    lang = config.SERVER_DEFAULTS.get(server, fallback)['lang']
    safe_name = ''.join(c for c in book if c.isalnum())
    timestamp = time.strftime('%Y.%m.%d-%H.%M.%S')
    return '%s-%s-%s%s' % (safe_name, lang, timestamp, suffix)
def _add_initial_number(e, n):
    """Prefix element e with a <strong class="initial"> holding "n."

    The element's own leading text is moved into the tail of the new
    <strong>, after a single space, so that it still follows the number.
    """
    marker = e.makeelement("strong", Class="initial")
    e.insert(0, marker)
    leading = e.text
    if leading is None:
        marker.tail = ' '
    else:
        marker.tail = ' ' + leading
    # NOTE(review): the flattened source does not show whether this reset
    # was inside the "text is not None" branch; it is done unconditionally.
    e.text = ''
    marker.text = "%s." % n
class TocItem(object):
    """One (status, chapter, title) entry from a TOC.txt file.

    status is a string code:
      '0' - section heading with no chapter
      '1' - chapter heading
      '2' - book title

    chapter is the twiki name of the chapter and title is its human
    readable name.
    """
    def __init__(self, status, chapter, title):
        self.status = status
        self.chapter = chapter
        self.title = title

    def is_chapter(self):
        """True if this entry is a chapter heading."""
        return self.status == '1'

    def is_section(self):
        """True if this entry is a section heading."""
        return self.status == '0'

    def is_title(self):
        """True if this entry is the book title."""
        return self.status == '2'

    def __str__(self):
        pairs = ['%s: %s' % pair for pair in self.__dict__.items()]
        return '<toc: %s>' % ', '.join(pairs)
class Book(object):
    """A book being fetched, parsed and turned into PDF/ODT output.

    All intermediate files live in a private working directory
    (self.workdir) which cleanup() removes.  A Book can be used as a
    context manager, in which case cleanup() runs on exit.
    """
    # Numbering styles handed to PageSettings.number_pdf().
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'

    def notify_watcher(self, message=None):
        """Report progress to the watcher callback, if there is one.

        With no message, the name of the calling function is used.
        """
        if self.watcher:
            if message is None:
                # The last two stack entries are the caller and this
                # method; field 2 of an entry is the function name.
                message = traceback.extract_stack(None, 2)[0][2]
            log("notify_watcher called with '%s'" % message)
            self.watcher(message)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cleanup()
        #could deal with exceptions here and return true

    def __init__(self, book, server, bookname,
                 page_settings=None, watcher=None, isbn=None,
                 license=config.DEFAULT_LICENSE):
        """Set up the working directory and the file names used later.

        book and server identify the source book; bookname is the file
        name it will be published under.  page_settings, if given, is a
        dict of keyword arguments for PageSettings.  watcher, if given,
        is called with a progress message at each step.
        """
        log("*** Starting new book %s ***" % bookname)
        self.book = book
        self.server = server
        self.watcher = watcher
        self.isbn = isbn
        self.license = license
        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        # mkdtemp makes the directory private; open it up for reading.
        os.chmod(self.workdir, 0o755)
        defaults = config.SERVER_DEFAULTS[server]
        self.lang = defaults['lang']
        self.dir = defaults['dir']

        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')

        self.publish_name = bookname
        self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
        self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)

        # NOTE(review): self.maker is only set when page_settings is
        # given, but the make_*_pdf methods use it unconditionally.
        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        self.notify_watcher()

    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()

    def filepath(self, fn):
        """Return the path of file fn inside the working directory."""
        return os.path.join(self.workdir, fn)

    def save_data(self, fn, data):
        """Save without tripping up on unicode"""
        if isinstance(data, unicode):
            data = data.encode('utf8', 'ignore')
        f = open(fn, 'w')
        try:
            f.write(data)
        finally:
            # close even if the write fails, so the handle is not leaked
            f.close()

    def save_tempfile(self, fn, data):
        """Save the data in a temporary directory that will be cleaned
        up when all is done.  Return the absolute file path."""
        fn = self.filepath(fn)
        self.save_data(fn, data)
        return fn

    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script."""
        self.wait_for_xvfb()
        html_text = lxml.etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()

    def extract_pdf_outline(self):
        """Parse the outline of the body pdf, storing it in
        self.outline_contents and self.outline_text.  Return the number
        of pages reported by parse_outline."""
        self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
        for x in self.outline_contents:
            log(x)
        self.notify_watcher()
        return number_of_pages

    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = lxml.etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
        self.notify_watcher('generate_pdf')

        #3. Read the outline (needed later for the table of contents)
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()

    def make_preamble_pdf(self):
        """Make the front matter pdf: title, inside cover and table of
        contents.  Needs the outline from make_body_pdf()."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents">%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     contents, self.title)
        self.save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()

    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover.  If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        self.save_data(self.tail_html_file, self.compose_end_matter())
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()

    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        # isbn_pdf_file is None unless an isbn was supplied.
        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')

    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page.  This is used for multicolumn newspapers and for
        web-destined pdfs."""
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = lxml.etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        # NOTE(review): the flattened source does not show whether step 4
        # is inside this branch; it is assumed here that web pdfs get
        # neither reshaping nor page numbers -- confirm against history.
        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()

    def rotate180(self):
        """Rotate the pdf 180 degrees so an RTL book can print on LTR
        presses."""
        rotated = self.filepath('final-rotate.pdf')
        unrotated = self.filepath('final-pre-rotate.pdf')
        #leave the unrotated pdf intact at first, in case of error.
        rotate_pdf(self.pdf_file, rotated)
        os.rename(self.pdf_file, unrotated)
        os.rename(rotated, self.pdf_file)
        self.notify_watcher()

    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()

    def get_twiki_metadata(self):
        """Get information about a twiki book (as much as is easy and useful).

        Returns a dict with the keys 'metadata', 'TOC', 'spine' and
        'copyright'.  In 'TOC' a chapter is a (title, filename) pair and
        a section is [[title, None], [chapters...]].
        """
        if not hasattr(self, 'toc'):
            self.load_toc()

        title_map = {}
        meta = {
            'language': self.lang,
            'identifier': 'http://%s/epub/%s/%s' %(self.server, self.book, time.strftime('%Y.%m.%d-%H.%M.%S')),
            'publisher': 'FLOSS Manuals http://flossmanuals.net',
            'creator': 'The Contributors',
            'date': time.strftime('%Y-%m-%d'),
            'fm:server': self.server,
            'fm:book': self.book,
            'title': self.book,
            }
        spine = []
        toc = []
        # Chapters go into the most recent section, or straight into the
        # top level if no section has been seen yet.
        section = toc
        for t in self.toc:
            if t.is_chapter():
                spine.append(t.chapter)
                section.append((t.title, t.chapter + '.html')) #XXX
                title_map[t.title] = t.chapter
            elif t.is_section():
                section = []
                toc.append([[t.title, None], section])
            elif t.is_title():
                meta['title'] = t.title

        author_copyright, chapter_copyright = twiki_wrapper.get_book_copyright(self.server, self.book, title_map)

        return {
            'metadata': meta,
            'TOC': toc,
            'spine': spine,
            'copyright': author_copyright,
            #'chapter_copyright': chapter_copyright,
            }

    def load_toc(self):
        """From the TOC.txt file create a list of TocItems with
        the attributes <status>, <chapter>, and <title>.

        <status> is a number, with the following meaning:

              0 - section heading with no chapter
              1 - chapter heading
              2 - book title

        The TocItem object has convenience functions <is_chapter> and
        <is_section>.

        <chapter> is twiki name of the chapter.

        <title> is a human readable title for the chapter.  It is likely to
        differ from the title given in the chapter's <h1> heading.
        """
        self.toc = []
        for status, chapter, title in twiki_wrapper.toc_iterator(self.server, self.book):
            self.toc.append(TocItem(status, chapter, title))
        self.notify_watcher()

    def load_book(self):
        """Fetch and parse the raw html of the book.  Links in the
        document will be made absolute."""
        html = twiki_wrapper.get_book_html(self.server, self.book, self.dir)
        self.save_tempfile('raw.html', html)

        self.tree = lxml.html.document_fromstring(html)
        self.tree.make_links_absolute(config.BOOK_URL % (self.server, self.book))
        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        # Remember each heading's plain text before numbers are added to it.
        for h1 in self.headings:
            h1.title = h1.text_content().strip()
        self.notify_watcher()

    def load(self):
        """Wrapper around all necessary load methods."""
        self.load_book()
        self.load_toc()

    def make_contents(self):
        """Generate HTML containing the table of contents.  This can
        only be done after the main PDF has been made."""
        header = '<h1>Table of Contents</h1><table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1

        # Chapters in the TOC are paired, in order, with the document's
        # h1 headings and with the entries of the pdf outline.
        outline_contents = iter(self.outline_contents)
        headings = iter(self.headings)

        for t in self.toc:
            if t.is_chapter():
                try:
                    h1 = next(headings)
                except StopIteration:
                    log("heading not found for %s (previous h1 missing?). Stopping" % t)
                    break
                try:
                    h1_text, level, page_num = next(outline_contents)
                except StopIteration:
                    log("contents data not found for %s. Stopping" % t)
                    break
                log("%r %r" % (h1.title, h1_text))
                contents.append(row_tmpl % (chapter, h1.title, page_num))
                chapter += 1
            elif t.is_section():
                contents.append(section_tmpl % t.title)
            else:
                log("mystery TOC item: %s" % t)

        doc = header + '\n'.join(contents) + footer
        self.notify_watcher()
        return doc

    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        headings = iter(self.headings)
        chapter = 1
        section = None

        for t in self.toc:
            # NOTE(review): chapters listed before the first section are
            # skipped here without consuming an h1, unlike make_contents().
            if t.is_chapter() and section is not None:
                try:
                    h1 = next(headings)
                except StopIteration:
                    log("heading not found for %s (previous h1 missing?)" % t)
                    break
                item = h1.makeelement('div', Class='chapter')
                log(h1.title, debug='HTMLGEN')
                item.text = h1.title
                _add_initial_number(item, chapter)

                section.append(item)

                # The section page goes in front of its first chapter.
                if not section_placed:
                    log("placing section", debug='HTMLGEN')
                    h1.addprevious(section)
                    section_placed = True
                else:
                    log("NOT placing section", debug='HTMLGEN')

                #put a bold number at the beginning of the h1.
                _add_initial_number(h1, chapter)
                chapter += 1

            elif t.is_section():
                section = self.tree.makeelement('div', Class="subsection")
                # section Element complains when you try to ask it whether it
                # has been placed (though it does know)
                section_placed = False
                heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
                heading.set("Class", "subsection-heading")
                section.append(heading)

        self.notify_watcher()

    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to.

        If css is empty, the server's default stylesheet for the mode
        is used.  The chosen url is stored in self.css_url and returned.
        """
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            defaults = config.SERVER_DEFAULTS[self.server]
            url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
        elif not re.match(r'^http://\S+$', css):
            fn = self.save_tempfile('objavi.css', css)
            url = 'file://' + fn
        else:
            url = css

        #find the head -- it's probably first child but lets not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url

    def set_title(self, title=None):
        """If a string is supplied, it becomes the book's title.
        Otherwise a guess is made."""
        if title:
            self.title = title
        else:
            titles = [x.text_content() for x in self.tree.cssselect('title')]
            if titles and titles[0]:
                self.title = titles[0]
            else:
                #oh well
                self.title = 'A Manual About ' + self.book
        return self.title

    def _read_localised_template(self, template, fallbacks=('en',)):
        """Try to get the template in the approriate language, otherwise in english.

        template is a file name pattern with one %s for the language
        code.  Returns the contents of the first variant that can be
        opened, trying self.lang and then each of fallbacks.  Raises
        IOError if none of them can be opened.
        """
        tried = []
        for lang in [self.lang] + list(fallbacks):
            fn = template % (lang)
            tried.append(fn)
            try:
                f = open(fn)
            except IOError as e:
                log("couldn't open template for lang %s (filename %s)" % (lang, fn))
                log(e)
                continue
            try:
                return f.read()
            finally:
                f.close()
        # Without this the caller would see an unhelpful unbound-variable error.
        raise IOError("no localised template could be opened (tried %s)"
                      % ', '.join(tried))

    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           }

    def compose_end_matter(self):
        """create the markup for the end_matter inside cover.  If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages.
        """
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title
             }

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d

    def spawn_x(self):
        """Start an Xvfb instance, using a new server number.  A
        reference to it is stored in self.xvfb, which is used to kill
        it when the pdf is done.

        Note that Xvfb doesn't interact well with dbus which is
        present on modern desktops.
        """
        #Find an unused server number (in case two cgis are running at once)
        while True:
            servernum = random.randrange(50, 500)
            if not os.path.exists('/tmp/.X%s-lock' % servernum):
                break

        self.xserver_no = ':%s' % servernum

        authfile = self.filepath('Xauthority')
        os.environ['XAUTHORITY'] = authfile

        #mcookie(1) eats into /dev/random, so avoid that
        from hashlib import md5
        m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
        mcookie = m.hexdigest()

        check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

        self.xvfb = Popen(['Xvfb', self.xserver_no,
                           '-screen', '0', '1024x768x24',
                           '-pixdepths', '32',
                           #'-blackpixel', '0',
                           #'-whitepixel', str(2 ** 24 -1),
                           #'+extension', 'Composite',
                           '-dpi', '96',
                           '-kb',
                           '-nolisten', 'tcp',
                           ])

        # We need to wait a bit before the Xvfb is ready.  but the
        # downloads are so slow that that probably doesn't matter

        self.xvfb_ready_time = time.time() + 2

        os.environ['DISPLAY'] = self.xserver_no
        log(self.xserver_no)

    def wait_for_xvfb(self):
        """wait until a previously set time before continuing.  This
        is so Xvfb has time to properly start."""
        if hasattr(self, 'xvfb'):
            d = self.xvfb_ready_time - time.time()
            if d > 0:
                time.sleep(d)
                # NOTE(review): nesting of this call was not recoverable
                # from the flattened source; assumed to follow the sleep.
                self.notify_watcher()

    def cleanup_x(self):
        """Try very hard to kill off Xvfb.  In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        if not hasattr(self, 'xvfb'):
            return
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        os.kill(p.pid, 15)
        # Give it up to ten polls, 0.2s apart, to exit on SIGTERM.
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            log("Xvfb would not die! kill -9! kill -9!")
            os.kill(p.pid, 9)

        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()

    def kill_old_processes(self):
        """Sometimes, despite everything, Xvfb or soffice instances
        hang around well after they are wanted -- for example if the
        cgi process dies particularly badly. So kill them if they have
        been running for a long time."""
        log("running kill_old_processes")
        # The command list is deliberately part of the same argument as -C.
        p = Popen(['ps', '-CXvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
                   '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
        data = p.communicate()[0].strip()
        if data:
            # pid, then elapsed time as [[days-]hours:]minutes:seconds
            etime_re = re.compile(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$')
            pids = []
            for line in data.split('\n'):
                log('dealing with ps output "%s"' % line)
                m = etime_re.match(line)
                if m is None:
                    log("Couldn't parse that line!")
                    # Skip it: there is nothing valid to act on.
                    continue
                pid, days, hours, minutes, seconds = m.groups()
                # 50 minutes should be enough xvfb time for anyone
                if days or hours or int(minutes) > 50:
                    pid = int(pid)
                    log("going to kill pid %s" % pid)
                    try:
                        os.kill(pid, 15)
                    except OSError as e:
                        # It may have exited since ps ran, or not be ours.
                        log('PID %s could not be signalled (%s)' % (pid, e))
                        continue
                    pids.append(pid)

            time.sleep(1.0)
            for pid in pids:
                #try again in case any are lingerers
                try:
                    os.kill(int(pid), 9)
                except OSError as e:
                    log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                    continue
                log('killing %s with -9' % pid)
        self.notify_watcher()

    def cleanup(self):
        """Kill this book's Xvfb and, unless config.KEEP_TEMP_FILES is
        set, delete the working directory and the files in it."""
        self.cleanup_x()
        if not config.KEEP_TEMP_FILES:
            for fn in os.listdir(self.workdir):
                os.remove(os.path.join(self.workdir, fn))
            os.rmdir(self.workdir)
        else:
            log("NOT removing '%s', containing the following files:" % self.workdir)
            log(*os.listdir(self.workdir))

        self.notify_watcher()
def fetch_zip(server, book, project, save=False):
    """Download a book as a booki-zip and return the raw zip data.

    The url depends on the 'interface' configured for the server:
    Booki servers are asked directly, TWiki servers via the gateway
    on HTTP_HOST.  Raises NotImplementedError for any other interface.

    If save is true, a copy is also written into
    config.BOOKI_BOOK_DIR under a timestamped name.
    """
    from urllib2 import urlopen
    settings = config.SERVER_DEFAULTS[server]
    interface = settings['interface']
    if interface not in ('Booki', 'TWiki'):
        raise NotImplementedError("Can't handle '%s' interface" % interface)
    if interface == 'Booki':
        url = config.BOOKI_ZIP_URL % {'server': server, 'project': project, 'book':book}
    else:
        url = config.TWIKI_GATEWAY_URL % (HTTP_HOST, server, book)
    log('fetching zip from %s'% url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        f.close()
    if save:
        zipname = make_book_name(book, server, '.zip')
        # The zip is binary data, so it must not be written in text mode.
        f = open('%s/%s' % (config.BOOKI_BOOK_DIR, zipname), 'wb')
        try:
            f.write(blob)
        finally:
            f.close()
    return blob
class ZipBook(Book):
    """A Book based on a booki-zip file.  Depending how out-of-date
    this docstring is, some of the parent's methods will not work.
    """
    def __init__(self, server, book, bookname, project=None, **kwargs):
        """Fetch the booki-zip for the book and read its info.json.

        Note that server comes before book here, which is the reverse
        of Book.__init__.  Other keyword arguments are passed on to
        Book.__init__.
        """
        log("starting zipbook with", server, book, project, kwargs)
        blob = fetch_zip(server, book, project, save=True)
        f = StringIO(blob)
        self.bookname = bookname
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        metadata = self.info['metadata']

        # For a local zip, prefer the origin recorded in its metadata.
        if server == config.LOCALHOST:
            server = metadata.get('fm:server', server)
            book = metadata.get('fm:book', book)

        Book.__init__(self, book, server, bookname, **kwargs)
        if 'title' in metadata:
            self.set_title(metadata['title'])
        self.project = project
        self.epubfile = self.filepath(bookname)

    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive."""
        ebook = ia_epub.Book(self.epubfile, content_dir='')
        manifest = self.info['manifest']
        metadata = self.info['metadata']
        toc = self.info['TOC']
        spine = self.info['spine']

        #manifest
        filemap = {} #reformulated manifest for NCX
        for ID in manifest:
            fn, mediatype = manifest[ID]
            #work around bug http://booki-dev.flossmanuals.net/ticket/46
            if ID.endswith('.html'):
                ID = ID[:-5]
                log('took ".html" off "%s"' % ID)

            oldfn = fn
            log(ID, fn, mediatype)
            content = self.store.read(fn)
            if mediatype == 'text/html':
                log('CONVERTING')
                #convert to application/xhtml+xml
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                c.prepare_for_epub()
                content = c.as_xhtml()
                # NOTE(review): assumes the file name ends in ".html".
                fn = fn[:-5] + '.xhtml'
                mediatype = 'application/xhtml+xml'
            if mediatype == 'application/xhtml+xml':
                filemap[oldfn] = fn

            info = {'id': ID.encode('utf-8'),
                    'href': fn.encode('utf-8'),
                    'media-type': mediatype.encode('utf-8')}
            ebook.add_content(info, content)

        #toc
        ncx = epub_utils.make_ncx(toc, metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in spine:
            ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        dcns = config.DCNS
        meta_info_items = []
        has_authors = False
        for k, v in metadata.items():
            if k.startswith('fm:'):
                continue
            meta_info_items.append({'item': dcns + k,
                                    'text': v})
            if k == 'creator':
                has_authors = True

        if not has_authors and config.CLAIM_UNAUTHORED:
            meta_info_items.append({'item': dcns + 'creator',
                                    'text': 'The Contributors'})

        #copyright
        authors = sorted(self.info['copyright'])
        for a in authors:
            meta_info_items.append({'item': dcns + 'contributor',
                                    'text': a})
        if not has_authors:
            meta_info_items.append({'item': dcns + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))})

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()

    def publish_s3(self):
        """Push the book's epub to archive.org, using S3.

        The credentials are read from the files named by
        config.S3_SECRET and config.S3_ACCESSKEY.  Returns the
        archive.org details url for the book.
        """
        #XXX why only epub?
        secrets = {}
        for x in ('S3_SECRET', 'S3_ACCESSKEY'):
            fn = getattr(config, x)
            f = open(fn)
            try:
                secrets[x] = f.read().strip()
            finally:
                f.close()

        # Never log the credentials themselves, only which ones were read.
        log("read S3 credentials: %s" % ', '.join(sorted(secrets)))
        now = time.strftime('%Y-%m-%d')
        s3url = 'http://s3.us.archive.org/booki-%s-%s/%s' % (self.project, self.book, self.bookname)
        detailsurl = 'http://archive.org/details/booki-%s-%s' % (self.project, self.book)
        headers = [
            'x-amz-auto-make-bucket:1',
            "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
            'x-archive-meta-mediatype:texts',
            'x-archive-meta-collection:opensource',
            'x-archive-meta-title:%s' %(self.book,),
            'x-archive-meta-date:%s' % (now,),
            'x-archive-meta-creator:FLOSS Manuals Contributors',
            ]

        if self.license in config.LICENSES:
            headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

        argv = ['curl', '--location',]
        for h in headers:
            argv.extend(('--header', h))
        argv.extend(('--upload-file', self.epubfile, s3url,))

        # Log the command with the authorization header blanked out.
        loggable = []
        for arg in argv:
            if arg.lower().startswith('authorization:'):
                arg = 'authorization: <redacted>'
            loggable.append(arg)
        log(loggable)
        # NOTE(review): the secret is still visible in curl's argument
        # list while it runs; consider passing it via a curl config file.
        check_call(argv)
        return detailsurl

    def publish_epub(self):
        """Move the epub into config.EPUB_DIR and return its new path."""
        self.epubfile = shift_file(self.epubfile, config.EPUB_DIR)
        return self.epubfile