Add max-age variable, allowing for caching of booki-zips
[objavi2.git] / objavi / fmbook.py
blob 3d43c18ec6e067857d9f188534516adecac920db
# Part of Objavi2, which turns html manuals into books.
# This contains classes representing books and coordinates their processing.
#
# Copyright (C) 2009 Douglas Bagnall
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

20 """Library module representing a complete FM book being turned into a
21 PDF"""
import os, sys
import tempfile
import re, time
import random
from subprocess import Popen, check_call, PIPE
from cStringIO import StringIO
from urllib2 import urlopen, HTTPError
import zipfile
import traceback
from string import ascii_letters
from pprint import pformat

try:
    import simplejson as json
except ImportError:
    import json

import lxml, lxml.html
from lxml import etree

from objavi import config, epub_utils
from objavi.cgi_utils import log, run, shift_file, make_book_name, guess_lang, guess_text_dir
from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
from objavi.epub import add_guts, _find_tag

from iarchive import epub as ia_epub
from booki.xhtml_utils import EpubChapter
from booki.bookizip import get_metadata, add_metadata, clear_metadata, get_metadata_schemes

TMPDIR = os.path.abspath(config.TMPDIR)
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
HTTP_HOST = os.environ.get('HTTP_HOST', '')
PUBLISH_PATH = "%s/books/" % DOC_ROOT

def _get_best_title(tocpoint):
    if 'html_title' in tocpoint:
        return tocpoint['html_title']
    if 'title' in tocpoint:
        return tocpoint['title']
    return 'Untitled'

def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e."""
    initial = e.makeelement("strong", Class="initial")
    e.insert(0, initial)
    initial.tail = ' '
    if e.text is not None:
        initial.tail += e.text
    e.text = ''
    initial.text = "%s." % n
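# For illustration: given a hypothetical heading <h1>Getting Started</h1>,
# _add_initial_number(h1, 3) produces markup along the lines of
# <h1><strong class="initial">3.</strong> Getting Started</h1>.
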
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience"""
    for item in toc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        item['depth'] = depth
        item["filename"] = filename
        item["fragment"] = fragment
        item["index"] = index
        index += 1
        if 'children' in item:
            index = expand_toc(item['children'], depth + 1, index)
    return index
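# For illustration (hypothetical TOC item): expand_toc() rewrites
#   {'title': 'Intro', 'url': '/intro.html#start', 'children': [...]}
# in place, adding 'depth': 1, 'filename': 'intro.html', 'fragment': 'start'
# and a serial 'index', then recurses into 'children' at depth + 1.
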
def _serialise(rtoc, stoc, depth):
    for item in rtoc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        stoc.append({"depth": depth,
                     "title": item['title'],
                     "url": url,
                     "filename": filename,
                     "fragment": fragment,
                     "type": item['type']
                     })
        if 'children' in item:
            _serialise(item['children'], stoc, depth + 1)

def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points.  Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for i, x in enumerate(stoc):
        x['position'] = i
    return stoc
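# For illustration: serialise_toc() flattens the nested TOC into a list of
# dicts, each carrying 'depth' and a sequential 'position', so callers can
# walk the contents in reading order without recursion.
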
def filename_toc_map(rtoc):
    tocmap = {}
    log(rtoc)
    def traverse(toc):
        for point in toc:
            log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            if 'children' in point:
                traverse(point['children'])
    traverse(rtoc)
    return tocmap
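# For illustration: the resulting map looks roughly like
#   {'intro.html': [<toc point>, ...], 'chapter-01.html': [<toc point>], ...}
# which lets concat_html() find every TOC point that lives in a given
# chapter file.
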
class Book(object):
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'

    def notify_watcher(self, message=None):
        if self.watcher:
            if message is None:
                #message is the name of the caller
                message = traceback.extract_stack(None, 2)[0][2]
            log("notify_watcher called with '%s'" % message)
            self.watcher(message)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cleanup()
        #could deal with exceptions here and return true

    def __init__(self, book, server, bookname, project=None,
                 page_settings=None, watcher=None, isbn=None,
                 license=config.DEFAULT_LICENSE, title=None,
                 max_age=0):
        log("*** Starting new book %s ***" % bookname,
            "starting zipbook with", server, book, project)
        self.watcher = watcher
        self.notify_watcher('start')
        self.bookname = bookname
        self.book = book
        self.server = server
        self.project = project
        self.cookie = ''.join(random.sample(ascii_letters, 10))
        try:
            blob = fetch_zip(server, book, project, save=True, max_age=max_age)
        except HTTPError, e:
            #log(e.url)
            traceback.print_exc()
            self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
            #not much to do?
            sys.exit()
        f = StringIO(blob)
        self.notify_watcher('fetch_zip')
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        for k in ('manifest', 'metadata', 'spine', 'TOC'):
            if k not in self.info:
                raise ObjaviError('info.json of %s lacks vital element "%s"' %
                                  (bookname, k))
            #check types also?

        self.metadata = self.info['metadata']
        self.spine = self.info['spine']
        self.manifest = self.info['manifest']

        if server == config.LOCALHOST: # [DEPRECATED]
            server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
            book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]

        log(pformat(self.metadata))
        self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
        if not self.lang:
            self.lang = guess_lang(server, book)
            log('guessed lang as %s' % self.lang)

        self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
        if not self.toc_header:
            self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

        self.dir = get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0]
        if not self.dir:
            self.dir = guess_text_dir(server, book)

        #Patch in the extra metadata.  (lang and dir may be set from config)
        #these should be read from zip -- so should go into zip?
        for var, key, scheme, ns in (
            (isbn, 'id', 'ISBN', config.DC),
            (license, 'rights', 'License', config.DC),
            (title, 'title', '', config.DC),
            (self.lang, 'language', '', config.DC),
            (self.dir, 'dir', '', config.FM),
            ):
            if var is not None:
                add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

        self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
        self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]

        self.toc = self.info['TOC']
        expand_toc(self.toc)

        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        os.chmod(self.workdir, 0755)

        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')

        self.publish_file = os.path.join(PUBLISH_PATH, bookname)
        self.publish_url = os.path.join(config.PUBLISH_URL, bookname)

        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        titles = get_metadata(self.metadata, 'title')
        if titles:
            self.title = titles[0]
        else:
            self.title = 'A Manual About ' + self.book

        self.notify_watcher()
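    # Illustrative usage sketch (the server/book values are hypothetical):
    # a CGI front end would drive the pipeline roughly like
    #     with Book(book, server, bookname, max_age=30) as b:
    #         b.spawn_x()
    #         b.load_book()
    #         b.add_css(None, 'book')
    #         b.add_section_titles()
    #         b.make_book_pdf()
    #         b.publish_pdf()
    # max_age is passed straight through to fetch_zip() (minutes of acceptable
    # booki-zip cache age; the default 0 forces a fresh fetch).
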
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Don't even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()

    def get_tree_by_id(self, id):
        """get an HTML tree from the given manifest ID"""
        name = self.manifest[id]['url']
        mimetype = self.manifest[id]['mimetype']
        s = self.store.read(name)
        f = StringIO(s)
        if mimetype == 'text/html':
            try:
                tree = lxml.html.parse(f)
            except etree.XMLSyntaxError, e:
                log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                    (id, name, s[:20], e))
                tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
        elif 'xml' in mimetype: #XXX or is this just asking for trouble?
            tree = etree.parse(f)
        else:
            tree = f.read()
        f.close()
        return tree

    def filepath(self, fn):
        return os.path.join(self.workdir, fn)

    def save_data(self, fn, data):
        """Save without tripping up on unicode"""
        if isinstance(data, unicode):
            data = data.encode('utf8', 'ignore')
        f = open(fn, 'w')
        f.write(data)
        f.close()

    def save_tempfile(self, fn, data):
        """Save the data in a temporary directory that will be cleaned
        up when all is done.  Return the absolute file path."""
        fn = self.filepath(fn)
        self.save_data(fn, data)
        return fn

    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script."""
        self.wait_for_xvfb()
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()

    def extract_pdf_outline(self):
        #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
        debugf = self.filepath('outline.txt')
        self.outline_contents, self.outline_text, number_of_pages = \
                parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" % (key, e.text, titlemap[key]))

            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            self.save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii_outline.txt')
            ascii_contents, ascii_text, number_of_ascii_pages = \
                    parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log("number of pages: %s, post ascii: %s" %
                (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))
                self.outline_contents.append((title, depth, pageno))
        else:
            for x in self.outline_contents:
                log(x)

        self.notify_watcher()
        return number_of_pages

    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
        self.notify_watcher('generate_pdf')

        #3. extract the outline (and page count) for the table of contents
        n_pages = self.extract_pdf_outline()

        log("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()

    def make_preamble_pdf(self):
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log(self.dir, self.css_url, self.title, inside_cover_html,
            self.toc_header, contents, self.title)

        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html.decode('utf-8'),
                     self.toc_header, contents, self.title)
        self.save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()

    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover.  If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        log(end_matter)
        self.save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()

    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')
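    # For illustration: the final pdf is stitched together in this order:
    # preamble (title page and contents), body, end matter, then the ISBN
    # barcode page when an ISBN was supplied (self.isbn_pdf_file stays None
    # otherwise, and concat_pdfs is presumably expected to skip None inputs).
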
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page.  This is used for multicolumn newspapers and for
        web-destined pdfs."""
        self.wait_for_xvfb()
        #0. Add heading to beginning of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to the final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()

    def rotate180(self):
        """Rotate the pdf 180 degrees so an RTL book can print on LTR
        presses."""
        rotated = self.filepath('final-rotate.pdf')
        unrotated = self.filepath('final-pre-rotate.pdf')
        #leave the unrotated pdf intact at first, in case of error.
        rotate_pdf(self.pdf_file, rotated)
        os.rename(self.pdf_file, unrotated)
        os.rename(rotated, self.pdf_file)
        self.notify_watcher()

    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()

    def concat_html(self):
        """Join all the chapters together into one tree.  Keep the TOC
        up-to-date along the way."""

        #each manifest item looks like:
        #{'contributors': []
        #'license': [],
        #'mimetype': '',
        #'rightsholders': []
        #'url': ''}
        doc = lxml.html.document_fromstring('<html><body></body></html>')
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            log(ID, pformat(details))
            root = self.get_tree_by_id(ID).getroot()
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            log('chapter has title "%s", found html title "%s"' %
                                (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                point['html_id'] = fragment

            add_guts(root, doc)
        return doc
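    # For illustration: a TOC point whose url has no #fragment gets an anchor
    # id of the form '<cookie>_<index>' (e.g. 'aBcDeFgHiJ_4', the cookie being
    # the random ten-letter string chosen in __init__), set either on the
    # chapter's first element or on a hidden marker div.
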
    def unpack_static(self):
        """Extract static files from the zip for the html to refer to."""
        static_files = [x['url'] for x in self.manifest.values()
                        if x['url'].startswith('static')]
        if static_files:
            os.mkdir(self.filepath('static'))

        for name in static_files:
            s = self.store.read(name)
            f = open(self.filepath(name), 'w')
            f.write(s)
            f.close()
        self.notify_watcher()

    def load_book(self):
        """Unpack the static files and build the single concatenated HTML tree."""
        #XXX concatenate the HTML to match how TWiki version worked.
        # This is perhaps foolishly early -- throwing away useful boundaries.
        self.unpack_static()
        self.tree = self.concat_html()
        self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            h1.title = h1.text_content().strip()
        self.notify_watcher()

    def make_contents(self):
        """Generate HTML containing the table of contents.  This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline."""
        header = '<h1>Table of Contents</h1><table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        subsections = [] # for the subsection heading pages.

        outline_contents = iter(self.outline_contents)
        headings = iter(self.headings)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        self.notify_watcher()
        return doc

    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        headings = iter(self.headings)
        chapter = 1
        section = None
        log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#'+ child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                location = self.tree.cssselect('#'+ t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()

    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to."""
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            defaults = config.SERVER_DEFAULTS[self.server]
            url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
        elif not re.match(r'^http://\S+$', css):
            fn = self.save_tempfile('objavi.css', css)
            url = 'file://' + fn
        else:
            url = css
        #XXX for debugging and perhaps sensible anyway
        #url = url.replace('file:///home/douglas/objavi2', '')

        #find the head -- it's probably first child but let's not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url
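    # For illustration: add_css('h1 {color: red}') saves the text as
    # objavi.css in the workdir and links file://<workdir>/objavi.css;
    # add_css('http://example.com/style.css') (a hypothetical URL) is linked
    # directly; with no css argument the server's default stylesheet for the
    # given mode is used.
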
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the appropriate language, otherwise in english."""
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template

    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           }

    def compose_end_matter(self):
        """create the markup for the end_matter inside cover.  If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages.
        """
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title
             }

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d

    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive."""
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        for ID in self.manifest:
            details = self.manifest[ID]
            log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            oldfn = fn
            content = self.store.read(fn)
            if mediatype == 'text/html':
                log('CONVERTING')
                #convert to application/xhtml+xml
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                c.prepare_for_epub()
                content = c.as_xhtml()
                if fn[-5:] == '.html':
                    fn = fn[:-5]
                fn += '.xhtml'
                mediatype = 'application/xhtml+xml'
                filemap[oldfn] = fn

            info = {'id': ID.encode('utf-8'),
                    'href': fn.encode('utf-8'),
                    'media-type': mediatype.encode('utf-8')}
            ebook.add_content(info, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                            }
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}
                        meta_info_items.append(item)

        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                                   )

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()

    def publish_s3(self):
        """Push the book's epub to archive.org, using S3."""
        #XXX why only epub?
        secrets = {}
        for x in ('S3_SECRET', 'S3_ACCESSKEY'):
            fn = getattr(config, x)
            f = open(fn)
            secrets[x] = f.read().strip()
            f.close()

        log(secrets)
        now = time.strftime('%F')
        s3output = self.filepath('s3-output.txt')
        s3url = 'http://s3.us.archive.org/booki-%s-%s/%s' % (self.project, self.book, self.bookname)
        detailsurl = 'http://archive.org/details/booki-%s-%s' % (self.project, self.book)
        headers = [
            'x-amz-auto-make-bucket:1',
            "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
            'x-archive-meta-mediatype:texts',
            'x-archive-meta-collection:opensource',
            'x-archive-meta-title:%s' % (self.book,),
            'x-archive-meta-date:%s' % (now,),
            'x-archive-meta-creator:FLOSS Manuals Contributors',
            ]

        if self.license in config.LICENSES:
            headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

        argv = ['curl', '--location', '-s', '-o', s3output]
        for h in headers:
            argv.extend(('--header', h))
        argv.extend(('--upload-file', self.publish_file, s3url,))

        log(' '.join(repr(x) for x in argv))
        check_call(argv, stdout=sys.stderr)
        return detailsurl, s3url
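    # For illustration, the assembled command is roughly equivalent to
    #   curl --location -s -o <workdir>/s3-output.txt \
    #        --header 'x-amz-auto-make-bucket:1' \
    #        --header 'authorization: LOW <accesskey>:<secret>' \
    #        --header 'x-archive-meta-mediatype:texts' ... \
    #        --upload-file <publish_file> \
    #        http://s3.us.archive.org/booki-<project>-<book>/<bookname>
    # (angle-bracketed values are placeholders filled in from config and the
    # Book instance).
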
    def spawn_x(self):
        """Start an Xvfb instance, using a new server number.  A
        reference to it is stored in self.xvfb, which is used to kill
        it when the pdf is done.

        Note that Xvfb doesn't interact well with dbus which is
        present on modern desktops.
        """
        #Find an unused server number (in case two cgis are running at once)
        while True:
            servernum = random.randrange(50, 500)
            if not os.path.exists('/tmp/.X%s-lock' % servernum):
                break

        self.xserver_no = ':%s' % servernum

        authfile = self.filepath('Xauthority')
        os.environ['XAUTHORITY'] = authfile

        #mcookie(1) eats into /dev/random, so avoid that
        from hashlib import md5
        m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
        mcookie = m.hexdigest()

        check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

        self.xvfb = Popen(['Xvfb', self.xserver_no,
                           '-screen', '0', '1024x768x24',
                           '-pixdepths', '32',
                           #'-blackpixel', '0',
                           #'-whitepixel', str(2 ** 24 -1),
                           #'+extension', 'Composite',
                           '-dpi', '96',
                           '-kb',
                           '-nolisten', 'tcp',
                           ])

        # We need to wait a bit before the Xvfb is ready.  but the
        # downloads are so slow that that probably doesn't matter
        self.xvfb_ready_time = time.time() + 2

        os.environ['DISPLAY'] = self.xserver_no
        log(self.xserver_no)

    def wait_for_xvfb(self):
        """wait until a previously set time before continuing.  This
        is so Xvfb has time to properly start."""
        if hasattr(self, 'xvfb'):
            d = self.xvfb_ready_time - time.time()
            if d > 0:
                time.sleep(d)
            self.notify_watcher()

    def cleanup_x(self):
        """Try very hard to kill off Xvfb.  In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        if not hasattr(self, 'xvfb'):
            return
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            log("Xvfb would not die! kill -9! kill -9!")
            os.kill(p.pid, 9)

        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()

    def kill_old_processes(self):
        """Sometimes, despite everything, Xvfb or soffice instances
        hang around well after they are wanted -- for example if the
        cgi process dies particularly badly.  So kill them if they have
        been running for a long time."""
        log("running kill_old_processes")
        p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
                   '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
        data = p.communicate()[0].strip()
        if data:
            lines = data.split('\n')
            pids = []
            for line in lines:
                log('dealing with ps output "%s"' % line)
                try:
                    pid, days, hours, minutes, seconds \
                         = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
                except AttributeError:
                    log("Couldn't parse that line!")
                    continue
                # 50 minutes should be enough xvfb time for anyone
                if days or hours or int(minutes) > 50:
                    pid = int(pid)
                    log("going to kill pid %s" % pid)
                    os.kill(pid, 15)
                    pids.append(pid)

            time.sleep(1.0)
            for pid in pids:
                #try again in case any are lingerers
                try:
                    os.kill(int(pid), 9)
                except OSError, e:
                    log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                    continue
                log('killing %s with -9' % pid)
        self.notify_watcher()

    def cleanup(self):
        self.cleanup_x()
        if not config.KEEP_TEMP_FILES:
            for fn in os.listdir(self.workdir):
                os.remove(os.path.join(self.workdir, fn))
            os.rmdir(self.workdir)
        else:
            log("NOT removing '%s', containing the following files:" % self.workdir)
            log(*os.listdir(self.workdir))

        self.notify_watcher()


def use_cache():
    return (os.environ.get('HTTP_HOST') in config.USE_ZIP_CACHE_ALWAYS_HOSTS)

def _read_cached_zip(server, book, max_age):
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            f = open(zipname)
            blob = f.read()
            f.close()
            return blob
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError), e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
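# For illustration: with a hypothetical BOOKI_BOOK_DIR of '/var/www/booki-books',
# a cached zip might be named
#   /var/www/booki-books/mybook-en-2009.11.02-17.35.12.zip
# and is reused only if its embedded timestamp is newer than now minus max_age
# minutes; anything older (or with an unparseable name) falls through to a
# fresh fetch in fetch_zip().
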
def fetch_zip(server, book, project, save=False, max_age=-1):
    interface = config.SERVER_DEFAULTS[server]['interface']
    if interface not in ('Booki', 'TWiki'):
        raise NotImplementedError("Can't handle '%s' interface" % interface)
    if interface == 'Booki':
        url = config.BOOKI_ZIP_URL % {'server': server, 'project': project, 'book': book}
    else:
        url = config.TWIKI_GATEWAY_URL % (HTTP_HOST, server, book)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob = _read_cached_zip(server, book, max_age)
        if blob is not None:
            return blob

    log('fetching zip from %s' % url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        zipname = make_book_name(book, server, '.zip')
        f = open('%s/%s' % (config.BOOKI_BOOK_DIR, zipname), 'w')
        f.write(blob)
        f.close()
    return blob
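# For illustration: max_age is measured in minutes.  max_age=0 (the Book
# default) always fetches a fresh booki-zip; max_age=-1 lets hosts listed in
# config.USE_ZIP_CACHE_ALWAYS_HOSTS fall back to a 12-hour cache; any positive
# value accepts a cached zip up to that many minutes old, e.g.
#   blob = fetch_zip(server, book, project, save=True, max_age=30)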