objavi/fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This contains classes representing books and coordinates their processing.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Library module representing a complete FM book being turned into a
  21 PDF"""
  22
  23 import os, sys
  24 import tempfile
  25 import re, time
  26 import random
  27 from subprocess import Popen, check_call, PIPE
  28 from cStringIO import StringIO
  29 import zipfile
  30 import traceback
  31 try:
  32     import simplejson as json
  33 except ImportError:
  34     import json
  35
  36 import lxml, lxml.html, lxml.etree
  37
  38 from objavi import config, twiki_wrapper, epub_utils
  39 from objavi.cgi_utils import log, run, shift_file
  40 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
  41
  42 from iarchive import epub as ia_epub
  43 from booki.xhtml_utils import EpubChapter
  44
  45 TMPDIR = os.path.abspath(config.TMPDIR)
  46 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  47 HTTP_HOST = os.environ.get('HTTP_HOST', '')
  48 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  49
  50 def make_book_name(book, server, suffix='.pdf'):
  51     lang = config.SERVER_DEFAULTS.get(server, config.SERVER_DEFAULTS[config.DEFAULT_SERVER])['lang']
  52     book = ''.join(x for x in book if x.isalnum())
  53     return '%s-%s-%s%s' % (book, lang,
  54                            time.strftime('%Y.%m.%d-%H.%M.%S'),
  55                            suffix)
  56
  57 def _add_initial_number(e, n):
  58     """Put a styled chapter number n at the beginning of element e."""
  59     initial = e.makeelement("strong", Class="initial")
  60     e.insert(0, initial)
  61     initial.tail = ' '
  62     if e.text is not None:
  63         initial.tail += e.text
  64     e.text = ''
  65     initial.text = "%s." % n
  66
  67
  68 class TocItem(object):
  69     """This makes sense of the tuples from TOC.txt files"""
  70     def __init__(self, status, chapter, title):
  71         # status is
  72         #  0 - section heading with no chapter
  73         #  1 - chapter heading
  74         #  2 - book title
  75         #
  76         # chapter is twiki name of the chapter
  77         # title is a human readable name of the chapter.
  78         self.status = status
  79         self.chapter = chapter
  80         self.title = title
  81
  82     def is_chapter(self):
  83         return self.status == '1'
  84
  85     def is_section(self):
  86         return self.status == '0'
  87
  88     def is_title(self):
  89         return self.status == '2'
  90
  91     def __str__(self):
  92         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  93
  94
class Book(object):
    """A book being turned into a PDF (or other published file).

    An instance owns a temporary working directory holding the
    intermediate files.  It can be used as a context manager, in which
    case cleanup() is called on exit.
    """
    # numbering style given to number_pdf for the body pages
    page_numbers = 'latin'
    # numbering style given to number_pdf for the preamble pages
    preamble_page_numbers = 'roman'
  98
  99     def notify_watcher(self, message=None):
 100         if self.watcher:
 101             if  message is None:
 102                 #message is the name of the caller
 103                 message = traceback.extract_stack(None, 2)[0][2]
 104             log("notify_watcher called with '%s'" % message)
 105             self.watcher(message)
 106
    def __enter__(self):
        """Enter a with block; the book itself is the context value."""
        return self
 109
    def __exit__(self, exc_type, exc_value, traceback):
        """Leave a with block by calling cleanup().

        Nothing is returned, so an exception raised inside the block
        is not suppressed.
        """
        self.cleanup()
        #could deal with exceptions here and return true
 113
 114     def __init__(self, book, server, bookname,
 115                  page_settings=None, watcher=None, isbn=None,
 116                  license=config.DEFAULT_LICENSE):
 117         log("*** Starting new book %s ***" % bookname)
 118         self.book = book
 119         self.server = server
 120         self.watcher = watcher
 121         self.isbn = isbn
 122         self.license = license
 123         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 124         os.chmod(self.workdir, 0755)
 125         defaults = config.SERVER_DEFAULTS[server]
 126         self.lang = defaults['lang']
 127         self.dir  = defaults['dir']
 128
 129         self.body_html_file = self.filepath('body.html')
 130         self.body_pdf_file = self.filepath('body.pdf')
 131         self.preamble_html_file = self.filepath('preamble.html')
 132         self.preamble_pdf_file = self.filepath('preamble.pdf')
 133         self.tail_html_file = self.filepath('tail.html')
 134         self.tail_pdf_file = self.filepath('tail.pdf')
 135         self.isbn_pdf_file = None
 136         self.pdf_file = self.filepath('final.pdf')
 137         self.body_odt_file = self.filepath('body.odt')
 138
 139         self.publish_name = bookname
 140         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 141         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 142
 143         if page_settings is not None:
 144             self.maker = PageSettings(**page_settings)
 145
 146         self.notify_watcher()
 147
 148     if config.TRY_BOOK_CLEANUP_ON_DEL:
 149         #Dont even define __del__ if it is not used.
 150         _try_cleanup_on_del = True
 151         def __del__(self):
 152             if self._try_cleanup_on_del and os.path.exists(self.workdir):
 153                 self._try_cleanup_on_del = False #or else you can get in bad cycles
 154                 self.cleanup()
 155
    def filepath(self, fn):
        """Return the path of the file named fn inside this book's
        working directory."""
        return os.path.join(self.workdir, fn)
 158
 159     def save_data(self, fn, data):
 160         """Save without tripping up on unicode"""
 161         if isinstance(data, unicode):
 162             data = data.encode('utf8', 'ignore')
 163         f = open(fn, 'w')
 164         f.write(data)
 165         f.close()
 166
 167     def save_tempfile(self, fn, data):
 168         """Save the data in a temporary directory that will be cleaned
 169         up when all is done.  Return the absolute file path."""
 170         fn = self.filepath(fn)
 171         self.save_data(fn, data)
 172         return fn
 173
 174     def make_oo_doc(self):
 175         """Make an openoffice document, using the html2odt script."""
 176         self.wait_for_xvfb()
 177         html_text = lxml.etree.tostring(self.tree, method="html")
 178         self.save_data(self.body_html_file, html_text)
 179         run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
 180         log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
 181         os.rename(self.body_odt_file, self.publish_file)
 182         self.notify_watcher()
 183
 184     def extract_pdf_outline(self):
 185         self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
 186         for x in self.outline_contents:
 187             log(x)
 188         self.notify_watcher()
 189         return number_of_pages
 190
 191     def make_body_pdf(self):
 192         """Make a pdf of the HTML, using webkit"""
 193         #1. Save the html
 194         html_text = lxml.etree.tostring(self.tree, method="html")
 195         self.save_data(self.body_html_file, html_text)
 196
 197         #2. Make a pdf of it
 198         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
 199         self.notify_watcher('generate_pdf')
 200
 201         n_pages = self.extract_pdf_outline()
 202
 203         log ("found %s pages in pdf" % n_pages)
 204         #4. resize pages, shift gutters, even pages
 205         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 206         self.notify_watcher('reshape_pdf')
 207
 208         #5 add page numbers
 209         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 210                               numbers=self.page_numbers)
 211         self.notify_watcher("number_pdf")
 212         self.notify_watcher()
 213
 214     def make_preamble_pdf(self):
 215         contents = self.make_contents()
 216         inside_cover_html = self.compose_inside_cover()
 217         html = ('<html dir="%s"><head>\n'
 218                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 219                 '<link rel="stylesheet" href="%s" />\n'
 220                 '</head>\n<body>\n'
 221                 '<h1 class="frontpage">%s</h1>'
 222                 '%s\n'
 223                 '<div class="contents">%s</div>\n'
 224                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 225                 '<!--%s--></div></body></html>'
 226                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 227                      contents, self.title)
 228         self.save_data(self.preamble_html_file, html)
 229
 230         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
 231
 232         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 233
 234         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 235                             numbers=self.preamble_page_numbers,
 236                             number_start=-2)
 237
 238         self.notify_watcher()
 239
 240     def make_end_matter_pdf(self):
 241         """Make an inside back cover and a back cover.  If there is an
 242         isbn number its barcode will be put on the back cover."""
 243         if self.isbn:
 244             self.isbn_pdf_file = self.filepath('isbn.pdf')
 245             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 246             self.notify_watcher('make_barcode_pdf')
 247
 248         self.save_data(self.tail_html_file, self.compose_end_matter())
 249         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
 250
 251         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 252                                centre_end=True, even_pages=False)
 253         self.notify_watcher()
 254
 255     def make_book_pdf(self):
 256         """A convenient wrapper of a few necessary steps"""
 257         # now the Xvfb server is needed. make sure it has had long enough to get going
 258         self.wait_for_xvfb()
 259         self.make_body_pdf()
 260         self.make_preamble_pdf()
 261         self.make_end_matter_pdf()
 262
 263         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 264                     self.body_pdf_file, self.tail_pdf_file,
 265                     self.isbn_pdf_file)
 266
 267         self.notify_watcher('concatenated_pdfs')
 268
 269
 270     def make_simple_pdf(self, mode):
 271         """Make a simple pdf document without contents or separate
 272         title page.  This is used for multicolumn newspapers and for
 273         web-destined pdfs."""
 274         self.wait_for_xvfb()
 275         #0. Add heading to begining of html
 276         body = list(self.tree.cssselect('body'))[0]
 277         e = body.makeelement('h1', {'id': 'book-title'})
 278         e.text = self.title
 279         body.insert(0, e)
 280         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 281         e.addnext(intro)
 282
 283         #0.5 adjust parameters to suit the particular kind of output
 284         if mode == 'web':
 285             self.maker.gutter = 0
 286
 287         #1. Save the html
 288         html_text = lxml.etree.tostring(self.tree, method="html")
 289         self.save_data(self.body_html_file, html_text)
 290
 291         #2. Make a pdf of it (direct to to final pdf)
 292         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
 293         self.notify_watcher('generate_pdf')
 294         n_pages = count_pdf_pages(self.pdf_file)
 295
 296         if mode != 'web':
 297             #3. resize pages and shift gutters.
 298             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 299             self.notify_watcher('reshape_pdf')
 300
 301             #4. add page numbers
 302             self.maker.number_pdf(self.pdf_file, n_pages,
 303                                   dir=self.dir, numbers=self.page_numbers)
 304             self.notify_watcher("number_pdf")
 305         self.notify_watcher()
 306
 307
 308     def rotate180(self):
 309         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 310         presses."""
 311         rotated = self.filepath('final-rotate.pdf')
 312         unrotated = self.filepath('final-pre-rotate.pdf')
 313         #leave the unrotated pdf intact at first, in case of error.
 314         rotate_pdf(self.pdf_file, rotated)
 315         os.rename(self.pdf_file, unrotated)
 316         os.rename(rotated, self.pdf_file)
 317         self.notify_watcher()
 318
 319     def publish_pdf(self):
 320         """Move the finished PDF to its final resting place"""
 321         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 322         os.rename(self.pdf_file, self.publish_file)
 323         self.notify_watcher()
 324
 325     def get_twiki_metadata(self):
 326         """Get information about a twiki book (as much as is easy and useful)."""
 327         if not hasattr(self, 'toc'):
 328             self.load_toc()
 329
 330         title_map = {}
 331         authors = {}
 332         meta = {
 333             'language': self.lang,
 334             'identifier': 'http://%s/epub/%s/%s' %(self.server, self.book, time.strftime('%Y.%m.%d-%H.%M.%S')),
 335             'publisher': 'FLOSS Manuals http://flossmanuals.net',
 336             'creator': 'The Contributors',
 337             'date': time.strftime('%Y-%m-%d'),
 338             'fm:server': self.server,
 339             'fm:book': self.book,
 340             'title': self.book,
 341             }
 342         spine = []
 343         toc = []
 344         section = toc
 345         for t in self.toc:
 346             if t.is_chapter():
 347                 spine.append(t.chapter)
 348                 section.append((t.title, t.chapter + '.html')) #XXX
 349                 title_map[t.title] = t.chapter
 350             elif t.is_section():
 351                 section = []
 352                 toc.append([[t.title, None], section])
 353             elif t.is_title():
 354                 meta['title'] = t.title
 355
 356         author_copyright, chapter_copyright = twiki_wrapper.get_book_copyright(self.server, self.book, title_map)
 357
 358         return {
 359             'metadata': meta,
 360             'TOC': toc,
 361             'spine': spine,
 362             'copyright': author_copyright,
 363             #'chapter_copyright': chapter_copyright,
 364         }
 365
 366     def load_toc(self):
 367         """From the TOC.txt file create a list of TocItems with
 368         the attributes <status>, <chapter>, and <title>.
 369
 370         <status> is a number, with the following meaning:
 371
 372               0 - section heading with no chapter
 373               1 - chapter heading
 374               2 - book title
 375
 376         The TocItem object has convenience functions <is_chapter> and
 377         <is_section>.
 378
 379         <chapter> is twiki name of the chapter.
 380
 381         <title> is a human readable title for the chapter.  It is likely to
 382         differ from the title given in the chapter's <h1> heading.
 383         """
 384         self.toc = []
 385         for status, chapter, title in twiki_wrapper.toc_iterator(self.server, self.book):
 386             self.toc.append(TocItem(status, chapter, title))
 387         self.notify_watcher()
 388
 389     def load_book(self):
 390         """Fetch and parse the raw html of the book.  Links in the
 391         document will be made absolute."""
 392         html = twiki_wrapper.get_book_html(self.server, self.book, self.dir)
 393         self.save_tempfile('raw.html', html)
 394
 395         self.tree = lxml.html.document_fromstring(html)
 396         self.tree.make_links_absolute(config.BOOK_URL % (self.server, self.book))
 397         self.headings = [x for x in self.tree.cssselect('h1')]
 398         if self.headings:
 399             self.headings[0].set('class', "first-heading")
 400         for h1 in self.headings:
 401             h1.title = h1.text_content().strip()
 402         self.notify_watcher()
 403
    def load(self):
        """Wrapper around all necessary load methods: the book's html
        is loaded first (load_book), then its table of contents
        (load_toc)."""
        self.load_book()
        self.load_toc()
 408
 409     def make_contents(self):
 410         """Generate HTML containing the table of contents.  This can
 411         only be done after the main PDF has been made."""
 412         header = '<h1>Table of Contents</h1><table class="toc">\n'
 413         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 414                     '<td class="pagenumber">%s</td></tr>\n')
 415         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 416         footer = '\n</table>'
 417
 418         contents = []
 419
 420         chapter = 1
 421         page_num = 1
 422         subsections = [] # for the subsection heading pages.
 423
 424         outline_contents = iter(self.outline_contents)
 425         headings = iter(self.headings)
 426
 427         for t in self.toc:
 428             if t.is_chapter():
 429                 try:
 430                     h1 = headings.next()
 431                 except StopIteration:
 432                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 433                     break
 434                 try:
 435                     h1_text, level, page_num = outline_contents.next()
 436                 except StopIteration:
 437                     log("contents data not found for %s. Stopping" % t)
 438                     break
 439                 log("%r %r" % (h1.title, h1_text))
 440                 contents.append(row_tmpl % (chapter, h1.title, page_num))
 441                 chapter += 1
 442             elif t.is_section():
 443                 contents.append(section_tmpl % t.title)
 444             else:
 445                 log("mystery TOC item: %s" % t)
 446
 447         doc = header + '\n'.join(contents) + footer
 448         self.notify_watcher()
 449         return doc
 450
 451     def add_section_titles(self):
 452         """Add any section heading pages that the TOC.txt file
 453         specifies.  These are sub-book, super-chapter groupings.
 454
 455         Also add initial numbers to chapters.
 456         """
 457         headings = iter(self.headings)
 458         chapter = 1
 459         section = None
 460
 461         for t in self.toc:
 462             if t.is_chapter() and section is not None:
 463                 try:
 464                     h1 = headings.next()
 465                 except StopIteration:
 466                     log("heading not found for %s (previous h1 missing?)" % t)
 467                     break
 468                 item = h1.makeelement('div', Class='chapter')
 469                 log(h1.title, debug='HTMLGEN')
 470                 item.text = h1.title
 471                 _add_initial_number(item, chapter)
 472
 473                 section.append(item)
 474
 475                 if not section_placed:
 476                     log("placing section", debug='HTMLGEN')
 477                     h1.addprevious(section)
 478                     section_placed = True
 479                 else:
 480                     log("NOT placing section", debug='HTMLGEN')
 481
 482                 #put a bold number at the beginning of the h1.
 483                 _add_initial_number(h1, chapter)
 484                 chapter += 1
 485
 486             elif t.is_section():
 487                 section = self.tree.makeelement('div', Class="subsection")
 488                 # section Element complains when you try to ask it whether it
 489                 # has been placed (though it does know)
 490                 section_placed = False
 491                 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
 492                 heading.set("Class", "subsection-heading")
 493                 section.append(heading)
 494
 495         self.notify_watcher()
 496
 497
 498     def add_css(self, css=None, mode='book'):
 499         """If css looks like a url, use it as a stylesheet link.
 500         Otherwise it is the CSS itself, which is saved to a temporary file
 501         and linked to."""
 502         log("css is %r" % css)
 503         htmltree = self.tree
 504         if css is None or not css.strip():
 505             defaults = config.SERVER_DEFAULTS[self.server]
 506             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 507         elif not re.match(r'^http://\S+$', css):
 508             fn = self.save_tempfile('objavi.css', css)
 509             url = 'file://' + fn
 510         else:
 511             url = css
 512         #XXX for debugging and perhaps sensible anyway
 513         #url = url.replace('file:///home/douglas/objavi2', '')
 514
 515
 516         #find the head -- it's probably first child but lets not assume.
 517         for child in htmltree:
 518             if child.tag == 'head':
 519                 head = child
 520                 break
 521         else:
 522             head = htmltree.makeelement('head')
 523             htmltree.insert(0, head)
 524
 525         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 526         self.css_url = url
 527         self.notify_watcher()
 528         return url
 529
 530     def set_title(self, title=None):
 531         """If a string is supplied, it becomes the book's title.
 532         Otherwise a guess is made."""
 533         if title:
 534             self.title = title
 535         else:
 536             titles = [x.text_content() for x in self.tree.cssselect('title')]
 537             if titles and titles[0]:
 538                 self.title = titles[0]
 539             else:
 540                 #oh well
 541                 self.title = 'A Manual About ' + self.book
 542         return self.title
 543
 544     def _read_localised_template(self, template, fallbacks=['en']):
 545         """Try to get the template in the approriate language, otherwise in english."""
 546         for lang in [self.lang] + fallbacks:
 547             try:
 548                 fn = template % (lang)
 549                 f = open(fn)
 550                 break
 551             except IOError, e:
 552                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 553                 log(e)
 554         template = f.read()
 555         f.close()
 556         return template
 557
 558     def compose_inside_cover(self):
 559         """create the markup for the preamble inside cover."""
 560         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 561
 562         if self.isbn:
 563             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 564         else:
 565             isbn_text = ''
 566
 567         return template % {'date': time.strftime('%Y-%m-%d'),
 568                            'isbn': isbn_text,
 569                            'license': self.license,
 570                            }
 571
 572
 573     def compose_end_matter(self):
 574         """create the markup for the end_matter inside cover.  If
 575         self.isbn is not set, the html will result in a pdf that
 576         spills onto two pages.
 577         """
 578         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 579
 580         d = {'css_url': self.css_url,
 581              'title': self.title
 582              }
 583
 584         if self.isbn:
 585             d['inside_cover_style'] = ''
 586         else:
 587             d['inside_cover_style'] = 'page-break-after: always'
 588
 589         return template % d
 590
 591
 592
 593
 594     def spawn_x(self):
 595         """Start an Xvfb instance, using a new server number.  A
 596         reference to it is stored in self.xvfb, which is used to kill
 597         it when the pdf is done.
 598
 599         Note that Xvfb doesn't interact well with dbus which is
 600         present on modern desktops.
 601         """
 602         #Find an unused server number (in case two cgis are running at once)
 603         while True:
 604             servernum = random.randrange(50, 500)
 605             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 606                 break
 607
 608         self.xserver_no = ':%s' % servernum
 609
 610         authfile = self.filepath('Xauthority')
 611         os.environ['XAUTHORITY'] = authfile
 612
 613         #mcookie(1) eats into /dev/random, so avoid that
 614         from hashlib import md5
 615         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 616         mcookie = m.hexdigest()
 617
 618         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 619
 620         self.xvfb = Popen(['Xvfb', self.xserver_no,
 621                            '-screen', '0', '1024x768x24',
 622                            '-pixdepths', '32',
 623                            #'-blackpixel', '0',
 624                            #'-whitepixel', str(2 ** 24 -1),
 625                            #'+extension', 'Composite',
 626                            '-dpi', '96',
 627                            '-kb',
 628                            '-nolisten', 'tcp',
 629                            ])
 630
 631         # We need to wait a bit before the Xvfb is ready.  but the
 632         # downloads are so slow that that probably doesn't matter
 633
 634         self.xvfb_ready_time = time.time() + 2
 635
 636         os.environ['DISPLAY'] = self.xserver_no
 637         log(self.xserver_no)
 638
 639     def wait_for_xvfb(self):
 640         """wait until a previously set time before continuing.  This
 641         is so Xvfb has time to properly start."""
 642         if hasattr(self, 'xvfb'):
 643             d = self.xvfb_ready_time - time.time()
 644             if d > 0:
 645                 time.sleep(d)
 646                 self.notify_watcher()
 647
 648     def cleanup_x(self):
 649         """Try very hard to kill off Xvfb.  In addition to killing
 650         this instance's xvfb, occasionally (randomly) search for
 651         escaped Xvfb instances and kill those too."""
 652         if not hasattr(self, 'xvfb'):
 653             return
 654         check_call(['xauth', 'remove', self.xserver_no])
 655         p = self.xvfb
 656         log("trying to kill Xvfb %s" % p.pid)
 657         os.kill(p.pid, 15)
 658         for i in range(10):
 659             if p.poll() is not None:
 660                 log("%s died with %s" % (p.pid, p.poll()))
 661                 break
 662             log("%s not dead yet" % p.pid)
 663             time.sleep(0.2)
 664         else:
 665             log("Xvfb would not die! kill -9! kill -9!")
 666             os.kill(p.pid, 9)
 667
 668         if random.random() < 0.1:
 669             # occasionally kill old xvfbs and soffices, if there are any.
 670             self.kill_old_processes()
 671
 672     def kill_old_processes(self):
 673         """Sometimes, despite everything, Xvfb or soffice instances
 674         hang around well after they are wanted -- for example if the
 675         cgi process dies particularly badly. So kill them if they have
 676         been running for a long time."""
 677         log("running kill_old_processes")
 678         p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
 679                    '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 680         data = p.communicate()[0].strip()
 681         if data:
 682             lines = data.split('\n')
 683             pids = []
 684             for line in lines:
 685                 log('dealing with ps output "%s"' % line)
 686                 try:
 687                     pid, days, hours, minutes, seconds \
 688                          = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
 689                 except AttributeError:
 690                     log("Couldn't parse that line!")
 691                 # 50 minutes should be enough xvfb time for anyone
 692                 if days or hours or int(minutes) > 50:
 693                     pid = int(pid)
 694                     log("going to kill pid %s" % pid)
 695                     os.kill(pid, 15)
 696                     pids.append(pid)
 697
 698             time.sleep(1.0)
 699             for pid in pids:
 700                 #try again in case any are lingerers
 701                 try:
 702                     os.kill(int(pid), 9)
 703                 except OSError, e:
 704                     log('PID %s seems dead (re-kill gives %s)' % (pid, e))
 705                     continue
 706                 log('killing %s with -9' % pid)
 707         self.notify_watcher()
 708
 709     def cleanup(self):
 710         self.cleanup_x()
 711         if not config.KEEP_TEMP_FILES:
 712             for fn in os.listdir(self.workdir):
 713                 os.remove(os.path.join(self.workdir, fn))
 714             os.rmdir(self.workdir)
 715         else:
 716             log("NOT removing '%s', containing the following files:" % self.workdir)
 717             log(*os.listdir(self.workdir))
 718
 719         self.notify_watcher()
 720
 721
 722
 723 def fetch_zip(server, book, project, save=False):
 724     from urllib2 import urlopen
 725     settings = config.SERVER_DEFAULTS[server]
 726     interface = settings['interface']
 727     if interface not in ('Booki', 'TWiki'):
 728         raise NotImplementedError("Can't handle '%s' interface" % interface)
 729     if interface == 'Booki':
 730         url = config.BOOKI_ZIP_URL  % {'server': server, 'project': project, 'book':book}
 731     else:
 732         url = config.TWIKI_GATEWAY_URL % (HTTP_HOST, server, book)
 733     log('fetching zip from %s'% url)
 734     f = urlopen(url)
 735     blob = f.read()
 736     f.close()
 737     if save:
 738         zipname = make_book_name(book, server, '.zip')
 739         f = open('%s/%s' % (config.BOOKI_BOOK_DIR, zipname), 'w')
 740         f.write(blob)
 741         f.close()
 742     return blob
 743
 744 class ZipBook(Book):
 745     """A Book based on a booki-zip file.  Depending how out-of-date
 746     this docstring is, some of the parent's methods will not work.
 747     """
 748     def __init__(self, server, book, bookname, project=None, **kwargs):
 749         log("starting zipbook with", server, book, project, kwargs)
 750         blob = fetch_zip(server, book, project, save=True)
 751         f = StringIO(blob)
 752         self.bookname = bookname
 753         self.store = zipfile.ZipFile(f, 'r')
 754         self.info = json.loads(self.store.read('info.json'))
 755         metadata = self.info['metadata']
 756
 757         if server == config.LOCALHOST:
 758             server = metadata.get('fm:server', server)
 759             book = metadata.get('fm:book', book)
 760
 761         Book.__init__(self, book, server, bookname, **kwargs)
 762         if 'title' in metadata:
 763             self.set_title(metadata['title'])
 764         self.project = project
 765         self.epubfile = self.filepath(bookname)
 766
 767     def make_epub(self, use_cache=False):
 768         """Make an epub version of the book, using Mike McCabe's
 769         epub module for the Internet Archive."""
 770         ebook = ia_epub.Book(self.epubfile, content_dir='')
 771         manifest = self.info['manifest']
 772         metadata = self.info['metadata']
 773         toc = self.info['TOC']
 774         spine = self.info['spine']
 775
 776         #manifest
 777         filemap = {} #reformulated manifest for NCX
 778         for ID in manifest:
 779             fn, mediatype = manifest[ID]
 780             #work around bug http://booki-dev.flossmanuals.net/ticket/46
 781             if ID.endswith('.html'):
 782                 ID = ID[:-5]
 783                 log('took ".html" off "%s"' % ID)
 784
 785             oldfn = fn
 786             log(ID, fn, mediatype)
 787             content = self.store.read(fn)
 788             if mediatype == 'text/html':
 789                 log('CONVERTING')
 790                 #convert to application/xhtml+xml
 791                 c = EpubChapter(self.server, self.book, ID, content,
 792                                 use_cache=use_cache)
 793                 c.remove_bad_tags()
 794                 c.prepare_for_epub()
 795                 content = c.as_xhtml()
 796                 fn = fn[:-5] + '.xhtml'
 797                 mediatype = 'application/xhtml+xml'
 798             if mediatype == 'application/xhtml+xml':
 799                 filemap[oldfn] = fn
 800                 #log(fn, mediatype)
 801
 802             info = {'id': ID.encode('utf-8'),
 803                     'href': fn.encode('utf-8'),
 804                     'media-type': mediatype.encode('utf-8')}
 805             ebook.add_content(info, content)
 806
 807         #toc
 808         ncx = epub_utils.make_ncx(toc, metadata, filemap)
 809         ebook.add(ebook.content_dir + 'toc.ncx', ncx)
 810
 811         #spine
 812         for ID in spine:
 813             ebook.add_spine_item({'idref': ID})
 814
 815         #metadata -- no use of attributes (yet)
 816         # and fm: metadata disappears for now
 817         dcns = config.DCNS
 818         meta_info_items = []
 819         has_authors = False
 820         for k, v in metadata.iteritems():
 821             if k.startswith('fm:'):
 822                 continue
 823             meta_info_items.append({'item': dcns + k,
 824                                     'text': v}
 825                                    )
 826             if k == 'creator':
 827                 has_authors = True
 828
 829         if not has_authors and config.CLAIM_UNAUTHORED:
 830             meta_info_items.append({'item': dcns + 'creator',
 831                                     'text': 'The Contributors'})
 832
 833         #copyright
 834         authors = sorted(self.info['copyright'])
 835         for a in authors:
 836             meta_info_items.append({'item': dcns + 'contributor',
 837                                     'text': a}
 838                                    )
 839         if not has_authors:
 840             meta_info_items.append({'item': dcns + 'rights',
 841                                     'text': 'This book is free. Copyright %s' % (', '.join(authors))}
 842                                    )
 843
 844         tree_str = ia_epub.make_opf(meta_info_items,
 845                                     ebook.manifest_items,
 846                                     ebook.spine_items,
 847                                     ebook.guide_items,
 848                                     ebook.cover_id)
 849         ebook.add(ebook.content_dir + 'content.opf', tree_str)
 850         ebook.z.close()
 851
 852
 853     def publish_s3(self):
 854         """Push the book's epub to archive.org, using S3."""
 855         #XXX why only epub?
 856         secrets = {}
 857         for x in ('S3_SECRET', 'S3_ACCESSKEY'):
 858             fn = getattr(config, x)
 859             f = open(fn)
 860             secrets[x] = f.read().strip()
 861             f.close()
 862
 863         log(secrets)
 864         now = time.strftime('%F')
 865         s3output = self.filepath('s3-output.txt')
 866         s3url = 'http://s3.us.archive.org/booki-%s-%s/%s' % (self.project, self.book, self.bookname)
 867         detailsurl = 'http://archive.org/details/booki-%s-%s' % (self.project, self.book)
 868         headers = [
 869             'x-amz-auto-make-bucket:1',
 870             "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
 871             'x-archive-meta-mediatype:texts',
 872             'x-archive-meta-collection:opensource',
 873             'x-archive-meta-title:%s' %(self.book,),
 874             'x-archive-meta-date:%s' % (now,),
 875             'x-archive-meta-creator:FLOSS Manuals Contributors',
 876             ]
 877
 878         if self.license in config.LICENSES:
 879             headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
 880
 881         argv = ['curl', '--location', '-s', '-o', s3output]
 882         for h in headers:
 883             argv.extend(('--header', h))
 884         argv.extend(('--upload-file', self.epubfile, s3url,))
 885
 886         log(' '.join(repr(x) for x in argv))
 887         check_call(argv, stdout=sys.stderr)
 888         return detailsurl
 889
 890     def publish_epub(self):
 891         self.epubfile = shift_file(self.epubfile, config.EPUB_DIR)
 892         return self.epubfile