objavi/fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This contains classes representing books and coordinates their processing.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Library module representing a complete FM book being turned into a
  21 PDF"""
  22
  23 import os, sys
  24 import tempfile
  25 import re, time
  26 import random
  27 from subprocess import Popen, check_call, PIPE
  28 from cStringIO import StringIO
  29 import zipfile
  30 import traceback
  31 try:
  32     import simplejson as json
  33 except ImportError:
  34     import json
  35
  36 import lxml, lxml.html, lxml.etree
  37
  38 from objavi import config, twiki_wrapper, epub_utils
  39 from objavi.cgi_utils import log, run, shift_file
  40 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
  41
  42 from iarchive import epub as ia_epub
  43 from booki.xhtml_utils import EpubChapter
  44
  45 TMPDIR = os.path.abspath(config.TMPDIR)
  46 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  47 HTTP_HOST = os.environ.get('HTTP_HOST', '')
  48 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  49
  50 def make_book_name(book, server, suffix='.pdf'):
  51     lang = config.SERVER_DEFAULTS.get(server, config.SERVER_DEFAULTS[config.DEFAULT_SERVER])['lang']
  52     book = ''.join(x for x in book if x.isalnum())
  53     return '%s-%s-%s%s' % (book, lang,
  54                            time.strftime('%Y.%m.%d-%H.%M.%S'),
  55                            suffix)
  56
  57 def _add_initial_number(e, n):
  58     """Put a styled chapter number n at the beginning of element e."""
  59     initial = e.makeelement("strong", Class="initial")
  60     e.insert(0, initial)
  61     initial.tail = ' '
  62     if e.text is not None:
  63         initial.tail += e.text
  64     e.text = ''
  65     initial.text = "%s." % n
  66
  67
  68 class TocItem(object):
  69     """This makes sense of the tuples from TOC.txt files"""
  70     def __init__(self, status, chapter, title):
  71         # status is
  72         #  0 - section heading with no chapter
  73         #  1 - chapter heading
  74         #  2 - book title
  75         #
  76         # chapter is twiki name of the chapter
  77         # title is a human readable name of the chapter.
  78         self.status = status
  79         self.chapter = chapter
  80         self.title = title
  81
  82     def is_chapter(self):
  83         return self.status == '1'
  84
  85     def is_section(self):
  86         return self.status == '0'
  87
  88     def is_title(self):
  89         return self.status == '2'
  90
  91     def __str__(self):
  92         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  93
  94
  95 class Book(object):
  96     page_numbers = 'latin'
  97     preamble_page_numbers = 'roman'
  98
  99     def notify_watcher(self, message=None):
 100         if self.watcher:
 101             if  message is None:
 102                 #message is the name of the caller
 103                 message = traceback.extract_stack(None, 2)[0][2]
 104             log("notify_watcher called with '%s'" % message)
 105             self.watcher(message)
 106
 107     def __enter__(self):
 108         return self
 109
 110     def __exit__(self, exc_type, exc_value, traceback):
 111         self.cleanup()
 112         #could deal with exceptions here and return true
 113
 114     def __init__(self, book, server, bookname,
 115                  page_settings=None, watcher=None, isbn=None,
 116                  license=config.DEFAULT_LICENSE):
 117         log("*** Starting new book %s ***" % bookname)
 118         self.book = book
 119         self.server = server
 120         self.watcher = watcher
 121         self.isbn = isbn
 122         self.license = license
 123         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 124         os.chmod(self.workdir, 0755)
 125         defaults = config.SERVER_DEFAULTS[server]
 126         self.lang = defaults['lang']
 127         self.dir  = defaults['dir']
 128
 129         self.body_html_file = self.filepath('body.html')
 130         self.body_pdf_file = self.filepath('body.pdf')
 131         self.preamble_html_file = self.filepath('preamble.html')
 132         self.preamble_pdf_file = self.filepath('preamble.pdf')
 133         self.tail_html_file = self.filepath('tail.html')
 134         self.tail_pdf_file = self.filepath('tail.pdf')
 135         self.isbn_pdf_file = None
 136         self.pdf_file = self.filepath('final.pdf')
 137         self.body_odt_file = self.filepath('body.odt')
 138
 139         self.publish_name = bookname
 140         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 141         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 142
 143         if page_settings is not None:
 144             self.maker = PageSettings(**page_settings)
 145
 146         self.notify_watcher()
 147
 148     if config.TRY_BOOK_CLEANUP_ON_DEL:
 149         #Dont even define __del__ if it is not used.
 150         _try_cleanup_on_del = True
 151         def __del__(self):
 152             if self._try_cleanup_on_del and os.path.exists(self.workdir):
 153                 self._try_cleanup_on_del = False #or else you can get in bad cycles
 154                 self.cleanup()
 155
 156     def filepath(self, fn):
 157         return os.path.join(self.workdir, fn)
 158
 159     def save_data(self, fn, data):
 160         """Save without tripping up on unicode"""
 161         if isinstance(data, unicode):
 162             data = data.encode('utf8', 'ignore')
 163         f = open(fn, 'w')
 164         f.write(data)
 165         f.close()
 166
 167     def save_tempfile(self, fn, data):
 168         """Save the data in a temporary directory that will be cleaned
 169         up when all is done.  Return the absolute file path."""
 170         fn = self.filepath(fn)
 171         self.save_data(fn, data)
 172         return fn
 173
 174     def make_oo_doc(self):
 175         """Make an openoffice document, using the html2odt script."""
 176         self.wait_for_xvfb()
 177         html_text = lxml.etree.tostring(self.tree, method="html")
 178         self.save_data(self.body_html_file, html_text)
 179         run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
 180         log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
 181         os.rename(self.body_odt_file, self.publish_file)
 182         self.notify_watcher()
 183
 184     def extract_pdf_outline(self):
 185         self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
 186         for x in self.outline_contents:
 187             log(x)
 188         self.notify_watcher()
 189         return number_of_pages
 190
 191     def make_body_pdf(self):
 192         """Make a pdf of the HTML, using webkit"""
 193         #1. Save the html
 194         html_text = lxml.etree.tostring(self.tree, method="html")
 195         self.save_data(self.body_html_file, html_text)
 196
 197         #2. Make a pdf of it
 198         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
 199         self.notify_watcher('generate_pdf')
 200
 201         n_pages = self.extract_pdf_outline()
 202
 203         log ("found %s pages in pdf" % n_pages)
 204         #4. resize pages, shift gutters, even pages
 205         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 206         self.notify_watcher('reshape_pdf')
 207
 208         #5 add page numbers
 209         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 210                               numbers=self.page_numbers)
 211         self.notify_watcher("number_pdf")
 212         self.notify_watcher()
 213
 214     def make_preamble_pdf(self):
 215         contents = self.make_contents()
 216         inside_cover_html = self.compose_inside_cover()
 217         html = ('<html dir="%s"><head>\n'
 218                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 219                 '<link rel="stylesheet" href="%s" />\n'
 220                 '</head>\n<body>\n'
 221                 '<h1 class="frontpage">%s</h1>'
 222                 '%s\n'
 223                 '<div class="contents">%s</div>\n'
 224                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 225                 '<!--%s--></div></body></html>'
 226                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 227                      contents, self.title)
 228         self.save_data(self.preamble_html_file, html)
 229
 230         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
 231
 232         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 233
 234         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 235                             numbers=self.preamble_page_numbers,
 236                             number_start=-2)
 237
 238         self.notify_watcher()
 239
 240     def make_end_matter_pdf(self):
 241         """Make an inside back cover and a back cover.  If there is an
 242         isbn number its barcode will be put on the back cover."""
 243         if self.isbn:
 244             self.isbn_pdf_file = self.filepath('isbn.pdf')
 245             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 246             self.notify_watcher('make_barcode_pdf')
 247
 248         self.save_data(self.tail_html_file, self.compose_end_matter())
 249         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
 250
 251         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 252                                centre_end=True, even_pages=False)
 253         self.notify_watcher()
 254
 255     def make_book_pdf(self):
 256         """A convenient wrapper of a few necessary steps"""
 257         # now the Xvfb server is needed. make sure it has had long enough to get going
 258         self.wait_for_xvfb()
 259         self.make_body_pdf()
 260         self.make_preamble_pdf()
 261         self.make_end_matter_pdf()
 262
 263         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 264                     self.body_pdf_file, self.tail_pdf_file,
 265                     self.isbn_pdf_file)
 266
 267         self.notify_watcher('concatenated_pdfs')
 268
 269
 270     def make_simple_pdf(self, mode):
 271         """Make a simple pdf document without contents or separate
 272         title page.  This is used for multicolumn newspapers and for
 273         web-destined pdfs."""
 274         self.wait_for_xvfb()
 275         #0. Add heading to begining of html
 276         body = list(self.tree.cssselect('body'))[0]
 277         e = body.makeelement('h1', {'id': 'book-title'})
 278         e.text = self.title
 279         body.insert(0, e)
 280         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 281         e.addnext(intro)
 282
 283         #0.5 adjust parameters to suit the particular kind of output
 284         if mode == 'web':
 285             self.maker.gutter = 0
 286
 287         #1. Save the html
 288         html_text = lxml.etree.tostring(self.tree, method="html")
 289         self.save_data(self.body_html_file, html_text)
 290
 291         #2. Make a pdf of it (direct to to final pdf)
 292         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
 293         self.notify_watcher('generate_pdf')
 294         n_pages = count_pdf_pages(self.pdf_file)
 295
 296         if mode != 'web':
 297             #3. resize pages and shift gutters.
 298             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 299             self.notify_watcher('reshape_pdf')
 300
 301             #4. add page numbers
 302             self.maker.number_pdf(self.pdf_file, n_pages,
 303                                   dir=self.dir, numbers=self.page_numbers)
 304             self.notify_watcher("number_pdf")
 305         self.notify_watcher()
 306
 307
 308     def rotate180(self):
 309         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 310         presses."""
 311         rotated = self.filepath('final-rotate.pdf')
 312         unrotated = self.filepath('final-pre-rotate.pdf')
 313         #leave the unrotated pdf intact at first, in case of error.
 314         rotate_pdf(self.pdf_file, rotated)
 315         os.rename(self.pdf_file, unrotated)
 316         os.rename(rotated, self.pdf_file)
 317         self.notify_watcher()
 318
 319     def publish_pdf(self):
 320         """Move the finished PDF to its final resting place"""
 321         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 322         os.rename(self.pdf_file, self.publish_file)
 323         self.notify_watcher()
 324
 325     def get_twiki_metadata(self):
 326         """Get information about a twiki book (as much as is easy and useful)."""
 327         if not hasattr(self, 'toc'):
 328             self.load_toc()
 329
 330         title_map = {}
 331         authors = {}
 332         meta = {
 333             'language': self.lang,
 334             'identifier': 'http://%s/epub/%s/%s' %(self.server, self.book, time.strftime('%Y.%m.%d-%H.%M.%S')),
 335             'publisher': 'FLOSS Manuals http://flossmanuals.net',
 336             'date': time.strftime('%Y-%m-%d'),
 337             'fm:server': self.server,
 338             'fm:book': self.book,
 339             'title': self.book,
 340             }
 341         spine = []
 342         toc = []
 343         section = toc
 344         for t in self.toc:
 345             if t.is_chapter():
 346                 spine.append(t.chapter)
 347                 section.append((t.title, t.chapter))
 348                 title_map[t.title] = t.chapter
 349             elif t.is_section():
 350                 section = []
 351                 toc.append([[t.title, None], section])
 352             elif t.is_title():
 353                 meta['title'] = t.title
 354
 355         author_copyright, chapter_copyright = twiki_wrapper.get_book_copyright(self.server, self.book, title_map)
 356
 357         return {
 358             'metadata': meta,
 359             'TOC': toc,
 360             'spine': spine,
 361             'copyright': author_copyright,
 362             #'chapter_copyright': chapter_copyright,
 363         }
 364
 365     def load_toc(self):
 366         """From the TOC.txt file create a list of TocItems with
 367         the attributes <status>, <chapter>, and <title>.
 368
 369         <status> is a number, with the following meaning:
 370
 371               0 - section heading with no chapter
 372               1 - chapter heading
 373               2 - book title
 374
 375         The TocItem object has convenience functions <is_chapter> and
 376         <is_section>.
 377
 378         <chapter> is twiki name of the chapter.
 379
 380         <title> is a human readable title for the chapter.  It is likely to
 381         differ from the title given in the chapter's <h1> heading.
 382         """
 383         self.toc = []
 384         for status, chapter, title in twiki_wrapper.toc_iterator(self.server, self.book):
 385             self.toc.append(TocItem(status, chapter, title))
 386         self.notify_watcher()
 387
 388     def load_book(self):
 389         """Fetch and parse the raw html of the book.  Links in the
 390         document will be made absolute."""
 391         html = twiki_wrapper.get_book_html(self.server, self.book, self.dir)
 392         self.save_tempfile('raw.html', html)
 393
 394         self.tree = lxml.html.document_fromstring(html)
 395         self.tree.make_links_absolute(config.BOOK_URL % (self.server, self.book))
 396         self.headings = [x for x in self.tree.cssselect('h1')]
 397         if self.headings:
 398             self.headings[0].set('class', "first-heading")
 399         for h1 in self.headings:
 400             h1.title = h1.text_content().strip()
 401         self.notify_watcher()
 402
 403     def load(self):
 404         """Wrapper around all necessary load methods."""
 405         self.load_book()
 406         self.load_toc()
 407
 408     def make_contents(self):
 409         """Generate HTML containing the table of contents.  This can
 410         only be done after the main PDF has been made."""
 411         header = '<h1>Table of Contents</h1><table class="toc">\n'
 412         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 413                     '<td class="pagenumber">%s</td></tr>\n')
 414         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 415         footer = '\n</table>'
 416
 417         contents = []
 418
 419         chapter = 1
 420         page_num = 1
 421         subsections = [] # for the subsection heading pages.
 422
 423         outline_contents = iter(self.outline_contents)
 424         headings = iter(self.headings)
 425
 426         for t in self.toc:
 427             if t.is_chapter():
 428                 try:
 429                     h1 = headings.next()
 430                 except StopIteration:
 431                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 432                     break
 433                 h1_text, level, page_num = outline_contents.next()
 434                 log("%r %r" % (h1.title, h1_text))
 435                 contents.append(row_tmpl % (chapter, h1.title, page_num))
 436                 chapter += 1
 437             elif t.is_section():
 438                 contents.append(section_tmpl % t.title)
 439             else:
 440                 log("mystery TOC item: %s" % t)
 441
 442         doc = header + '\n'.join(contents) + footer
 443         self.notify_watcher()
 444         return doc
 445
 446     def add_section_titles(self):
 447         """Add any section heading pages that the TOC.txt file
 448         specifies.  These are sub-book, super-chapter groupings.
 449
 450         Also add initial numbers to chapters.
 451         """
 452         headings = iter(self.headings)
 453         chapter = 1
 454         section = None
 455
 456         for t in self.toc:
 457             if t.is_chapter() and section is not None:
 458                 try:
 459                     h1 = headings.next()
 460                 except StopIteration:
 461                     log("heading not found for %s (previous h1 missing?)" % t)
 462                     break
 463                 item = h1.makeelement('div', Class='chapter')
 464                 log(h1.title, debug='HTMLGEN')
 465                 item.text = h1.title
 466                 _add_initial_number(item, chapter)
 467
 468                 section.append(item)
 469
 470                 if not section_placed:
 471                     log("placing section", debug='HTMLGEN')
 472                     h1.addprevious(section)
 473                     section_placed = True
 474                 else:
 475                     log("NOT placing section", debug='HTMLGEN')
 476
 477                 #put a bold number at the beginning of the h1.
 478                 _add_initial_number(h1, chapter)
 479                 chapter += 1
 480
 481             elif t.is_section():
 482                 section = self.tree.makeelement('div', Class="subsection")
 483                 # section Element complains when you try to ask it whether it
 484                 # has been placed (though it does know)
 485                 section_placed = False
 486                 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
 487                 heading.set("Class", "subsection-heading")
 488                 section.append(heading)
 489
 490         self.notify_watcher()
 491
 492
 493     def add_css(self, css=None, mode='book'):
 494         """If css looks like a url, use it as a stylesheet link.
 495         Otherwise it is the CSS itself, which is saved to a temporary file
 496         and linked to."""
 497         log("css is %r" % css)
 498         htmltree = self.tree
 499         if css is None or not css.strip():
 500             defaults = config.SERVER_DEFAULTS[self.server]
 501             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 502         elif not re.match(r'^http://\S+$', css):
 503             fn = self.save_tempfile('objavi.css', css)
 504             url = 'file://' + fn
 505         else:
 506             url = css
 507         #XXX for debugging and perhaps sensible anyway
 508         #url = url.replace('file:///home/douglas/objavi2', '')
 509
 510
 511         #find the head -- it's probably first child but lets not assume.
 512         for child in htmltree:
 513             if child.tag == 'head':
 514                 head = child
 515                 break
 516         else:
 517             head = htmltree.makeelement('head')
 518             htmltree.insert(0, head)
 519
 520         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 521         self.css_url = url
 522         self.notify_watcher()
 523         return url
 524
 525     def set_title(self, title=None):
 526         """If a string is supplied, it becomes the book's title.
 527         Otherwise a guess is made."""
 528         if title:
 529             self.title = title
 530         else:
 531             titles = [x.text_content() for x in self.tree.cssselect('title')]
 532             if titles and titles[0]:
 533                 self.title = titles[0]
 534             else:
 535                 #oh well
 536                 self.title = 'A Manual About ' + self.book
 537         return self.title
 538
 539     def _read_localised_template(self, template, fallbacks=['en']):
 540         """Try to get the template in the approriate language, otherwise in english."""
 541         for lang in [self.lang] + fallbacks:
 542             try:
 543                 fn = template % (lang)
 544                 f = open(fn)
 545                 break
 546             except IOError, e:
 547                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 548                 log(e)
 549         template = f.read()
 550         f.close()
 551         return template
 552
 553     def compose_inside_cover(self):
 554         """create the markup for the preamble inside cover."""
 555         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 556
 557         if self.isbn:
 558             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 559         else:
 560             isbn_text = ''
 561
 562         return template % {'date': time.strftime('%Y-%m-%d'),
 563                            'isbn': isbn_text,
 564                            'license': self.license,
 565                            }
 566
 567
 568     def compose_end_matter(self):
 569         """create the markup for the end_matter inside cover.  If
 570         self.isbn is not set, the html will result in a pdf that
 571         spills onto two pages.
 572         """
 573         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 574
 575         d = {'css_url': self.css_url,
 576              'title': self.title
 577              }
 578
 579         if self.isbn:
 580             d['inside_cover_style'] = ''
 581         else:
 582             d['inside_cover_style'] = 'page-break-after: always'
 583
 584         return template % d
 585
 586
 587
 588
 589     def spawn_x(self):
 590         """Start an Xvfb instance, using a new server number.  A
 591         reference to it is stored in self.xvfb, which is used to kill
 592         it when the pdf is done.
 593
 594         Note that Xvfb doesn't interact well with dbus which is
 595         present on modern desktops.
 596         """
 597         #Find an unused server number (in case two cgis are running at once)
 598         while True:
 599             servernum = random.randrange(50, 500)
 600             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 601                 break
 602
 603         self.xserver_no = ':%s' % servernum
 604
 605         authfile = self.filepath('Xauthority')
 606         os.environ['XAUTHORITY'] = authfile
 607
 608         #mcookie(1) eats into /dev/random, so avoid that
 609         from hashlib import md5
 610         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 611         mcookie = m.hexdigest()
 612
 613         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 614
 615         self.xvfb = Popen(['Xvfb', self.xserver_no,
 616                            '-screen', '0', '1024x768x24',
 617                            '-pixdepths', '32',
 618                            #'-blackpixel', '0',
 619                            #'-whitepixel', str(2 ** 24 -1),
 620                            #'+extension', 'Composite',
 621                            '-dpi', '96',
 622                            '-kb',
 623                            '-nolisten', 'tcp',
 624                            ])
 625
 626         # We need to wait a bit before the Xvfb is ready.  but the
 627         # downloads are so slow that that probably doesn't matter
 628
 629         self.xvfb_ready_time = time.time() + 2
 630
 631         os.environ['DISPLAY'] = self.xserver_no
 632         log(self.xserver_no)
 633
 634     def wait_for_xvfb(self):
 635         """wait until a previously set time before continuing.  This
 636         is so Xvfb has time to properly start."""
 637         if hasattr(self, 'xvfb'):
 638             d = self.xvfb_ready_time - time.time()
 639             if d > 0:
 640                 time.sleep(d)
 641                 self.notify_watcher()
 642
 643     def cleanup_x(self):
 644         """Try very hard to kill off Xvfb.  In addition to killing
 645         this instance's xvfb, occasionally (randomly) search for
 646         escaped Xvfb instances and kill those too."""
 647         if not hasattr(self, 'xvfb'):
 648             return
 649         check_call(['xauth', 'remove', self.xserver_no])
 650         p = self.xvfb
 651         log("trying to kill Xvfb %s" % p.pid)
 652         os.kill(p.pid, 15)
 653         for i in range(10):
 654             if p.poll() is not None:
 655                 log("%s died with %s" % (p.pid, p.poll()))
 656                 break
 657             log("%s not dead yet" % p.pid)
 658             time.sleep(0.2)
 659         else:
 660             log("Xvfb would not die! kill -9! kill -9!")
 661             os.kill(p.pid, 9)
 662
 663         if random.random() < 0.1:
 664             # occasionally kill old xvfbs and soffices, if there are any.
 665             self.kill_old_processes()
 666
 667     def kill_old_processes(self):
 668         """Sometimes, despite everything, Xvfb or soffice instances
 669         hang around well after they are wanted -- for example if the
 670         cgi process dies particularly badly. So kill them if they have
 671         been running for a long time."""
 672         log("running kill_old_processes")
 673         p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
 674                    '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 675         data = p.communicate()[0].strip()
 676         if data:
 677             lines = data.split('\n')
 678             for line in lines:
 679                 log('dealing with ps output "%s"' % line)
 680                 try:
 681                     pid, days, hours, minutes, seconds \
 682                          = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
 683                 except AttributeError:
 684                     log("Couldn't parse that line!")
 685                 # 50 minutes should be enough xvfb time for anyone
 686                 if days or hours or int(minutes) > 50:
 687                     log("going to kill pid %s" % pid)
 688                     os.kill(int(pid), 15)
 689                     time.sleep(0.5)
 690                     try:
 691                         os.kill(int(pid), 9)
 692                         log('killing %s with -9')
 693                     except OSError, e:
 694                         pass
 695         self.notify_watcher()
 696
 697     def cleanup(self):
 698         self.cleanup_x()
 699         if not config.KEEP_TEMP_FILES:
 700             for fn in os.listdir(self.workdir):
 701                 os.remove(os.path.join(self.workdir, fn))
 702             os.rmdir(self.workdir)
 703         else:
 704             log("NOT removing '%s', containing the following files:" % self.workdir)
 705             log(*os.listdir(self.workdir))
 706
 707         self.notify_watcher()
 708
 709
 710
 711 def fetch_zip(server, book, project):
 712     from urllib2 import urlopen
 713     settings = config.SERVER_DEFAULTS[server]
 714     interface = settings['interface']
 715     if interface == 'Booki':
 716         url = config.BOOKI_ZIP_URL  % {'server': server, 'project': project, 'book':book}
 717         f = urlopen(url)
 718     elif interface == 'TWiki':
 719         url = config.TWIKI_GATEWAY_URL % (HTTP_HOST, server, book)
 720         f = urlopen(url)
 721     elif interface == 'local':
 722         f = open('%s/%s.zip' % (config.BOOKI_BOOK_DIR, book))
 723     else:
 724         raise NotImplementedError("Can't handle '%s' interface" % interface)
 725     if hasattr(f, 'geturl'):
 726         log(f.geturl())
 727     blob = f.read()
 728     f.close()
 729     return blob
 730
 731 class ZipBook(Book):
 732     """A Book based on a booki-zip file.  Depending how out-of-date
 733     this docstring is, some of the parent's methods will not work.
 734     """
 735     def __init__(self, server, book, project=None, **kwargs):
 736         blob = fetch_zip(server, book, project)
 737         f = StringIO(blob)
 738         self.store = zipfile.ZipFile(f, 'r')
 739         self.info = json.loads(self.store.read('info.json'))
 740         metadata = self.info['metadata']
 741
 742         if server == config.LOCALHOST:
 743             server = metadata.get('fm:server', server)
 744             book = metadata.get('fm:book', book)
 745
 746         bookname = make_book_name(book, server)
 747
 748         Book.__init__(self, book, server, bookname, **kwargs)
 749         self.set_title(metadata['title'])
 750         self.project = project
 751         self.epubfile = self.filepath('%s.epub' % self.book)
 752
 753     def make_epub(self, use_cache=False):
 754         """Make an epub version of the book, using Mike McCabe's
 755         epub module for the Internet Archive."""
 756         ebook = ia_epub.Book(self.epubfile, content_dir='')
 757         manifest = self.info['manifest']
 758         metadata = self.info['metadata']
 759         toc = self.info['TOC']
 760         spine = self.info['spine']
 761
 762         #manifest
 763         filemap = {} #reformulated manifest for NCX
 764         for ID in manifest:
 765             fn, mediatype = manifest[ID]
 766             oldfn = fn
 767             log(ID, fn, mediatype)
 768             content = self.store.read(fn)
 769             if mediatype == 'text/html':
 770                 log('CONVERTING')
 771                 #convert to application/xhtml+xml
 772                 c = EpubChapter(self.server, self.book, ID, content,
 773                                 use_cache=use_cache)
 774                 c.remove_bad_tags()
 775                 c.prepare_for_epub()
 776                 content = c.as_xhtml()
 777                 fn = fn[:-5] + '.xhtml'
 778                 mediatype = 'application/xhtml+xml'
 779             if mediatype == 'application/xhtml+xml':
 780                 filemap[oldfn] = fn
 781                 #log(fn, mediatype)
 782
 783             info = {'id': ID.encode('utf-8'),
 784                     'href': fn.encode('utf-8'),
 785                     'media-type': mediatype.encode('utf-8')}
 786             ebook.add_content(info, content)
 787
 788         #toc
 789         ncx = epub_utils.make_ncx(toc, metadata, filemap)
 790         ebook.add(ebook.content_dir + 'toc.ncx', ncx)
 791
 792         #spine
 793         for ID in spine:
 794             ebook.add_spine_item({'idref': ID})
 795
 796         #metadata -- no use of attributes (yet)
 797         # and fm: metadata disappears for now
 798         dcns = config.DCNS
 799         meta_info_items = [{'item': dcns + 'creator',
 800                             'text': 'The Contributors'}
 801                            ]
 802         for k, v in metadata.iteritems():
 803             if k.startswith('fm:'):
 804                 continue
 805             meta_info_items.append({'item': dcns + k,
 806                                     'text': v}
 807                                    )
 808
 809         #copyright
 810         authors = sorted(self.info['copyright'])
 811         for a in authors:
 812             meta_info_items.append({'item': dcns + 'contributor',
 813                                     'text': a}
 814                                    )
 815         meta_info_items.append({'item': dcns + 'rights',
 816                                 'text': 'This book is free. Copyright %s' % (', '.join(authors))}
 817                                )
 818
 819         tree_str = ia_epub.make_opf(meta_info_items,
 820                                     ebook.manifest_items,
 821                                     ebook.spine_items,
 822                                     ebook.guide_items,
 823                                     ebook.cover_id)
 824         ebook.add(ebook.content_dir + 'content.opf', tree_str)
 825         ebook.z.close()
 826
 827
 828     def publish_s3(self):
 829         """Push the book's epub to archive.org, using S3."""
 830         #XXX why only epub?
 831         secrets = {}
 832         for x in ('S3_SECRET', 'S3_ACCESSKEY'):
 833             fn = getattr(config, x)
 834             f = open(fn)
 835             secrets[x] = f.read().strip()
 836             f.close()
 837
 838         log(secrets)
 839         now = time.strftime('%F')
 840         s3url = 'http://s3.us.archive.org/booki-%s-%s/%s-%s.epub' % (self.project, self.book, self.book, now)
 841         detailsurl = 'http://archive.org/details/booki-%s-%s' % (self.project, self.book)
 842         headers = [
 843             'x-amz-auto-make-bucket:1',
 844             "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
 845             'x-archive-meta-mediatype:texts',
 846             'x-archive-meta-collection:opensource',
 847             'x-archive-meta-title:%s' %(self.book,),
 848             'x-archive-meta-date:%s' % (now,),
 849             'x-archive-meta-creator:FLOSS Manuals Contributors',
 850             ]
 851
 852         if self.license in config.LICENSES:
 853             headers.append('x-archive-meta-licenseurl:%s' % config.licenses[self.license])
 854
 855         argv = ['curl', '--location',]
 856         for h in headers:
 857             argv.extend(('--header', h))
 858         argv.extend(('--upload-file', self.epubfile, s3url,))
 859
 860         log(argv)
 861         check_call(argv)
 862         return detailsurl
 863
 864     def publish_epub(self):
 865         self.epubfile = shift_file(self.epubfile, config.EPUB_DIR)
 866         return self.epubfile