objavi/fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This contains classes representing books and coordinates their processing.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Library module representing a complete FM book being turned into a
  21 PDF"""
  22
  23 import os, sys
  24 import tempfile
  25 import re, time
  26 import random
  27 from subprocess import Popen, check_call, PIPE
  28 from cStringIO import StringIO
  29 from urllib2 import urlopen, HTTPError
  30 import zipfile
  31 import traceback
  32 from string import ascii_letters
  33 from pprint import pformat
  34
  35 try:
  36     import simplejson as json
  37 except ImportError:
  38     import json
  39
  40 import lxml.html
  41 from lxml import etree
  42
  43 from objavi import config, epub_utils
  44 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
  45 from objavi.book_utils import ObjaviError
  46 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
  47 from objavi.epub import add_guts, _find_tag
  48 from objavi.xhtml_utils import EpubChapter, split_tree
  49
  50 from iarchive import epub as ia_epub
  51 from booki.bookizip import get_metadata, add_metadata
  52
# Absolute form of config.TMPDIR; per-book working directories are created
# inside it (see the tempfile.mkdtemp call in Book.__init__).
TMPDIR = os.path.abspath(config.TMPDIR)
# Taken from the DOCUMENT_ROOT environment variable, falling back to
# config.HTDOCS when that variable is not set.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Taken from the HTTP_HOST environment variable; '' when it is not set.
HTTP_HOST = os.environ.get('HTTP_HOST', '')
  56
  57 def find_archive_urls(bookid, bookname):
  58     s3url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
  59     detailsurl = 'http://archive.org/details/booki-%s' % (bookid,)
  60     return (s3url, detailsurl)
  61
  62 def _get_best_title(tocpoint):
  63     if 'html_title' in tocpoint:
  64         return tocpoint['html_title']
  65     if 'title' in tocpoint:
  66         return tocpoint['title']
  67     return 'Untitled'
  68
  69
  70 def _add_initial_number(e, n):
  71     """Put a styled chapter number n at the beginning of element e."""
  72     initial = e.makeelement("strong", Class="initial")
  73     e.insert(0, initial)
  74     initial.tail = ' '
  75     if e.text is not None:
  76         initial.tail += e.text
  77     e.text = ''
  78     initial.text = "%s." % n
  79
  80 def expand_toc(toc, depth=1, index=0):
  81     """Reformat toc slightly for convenience"""
  82     for item in toc:
  83         url = item['url'].lstrip('/')
  84         bits = url.split('#', 1)
  85         filename = bits[0]
  86         fragment = (bits[1] if len(bits) == 2 else None)
  87         item['depth'] = depth
  88         item["filename"] = filename
  89         item["fragment"] = fragment
  90         item["index"] = index
  91         index += 1
  92         if 'children' in item:
  93             index = expand_toc(item['children'], depth + 1, index)
  94     return index
  95
  96 def _serialise(rtoc, stoc, depth):
  97     for item in rtoc:
  98         url = item['url'].lstrip('/')
  99         bits = url.split('#', 1)
 100         filename = bits[0]
 101         fragment = (bits[1] if len(bits) == 2 else None)
 102         stoc.append({"depth": depth,
 103                      "title": item['title'],
 104                      "url": url,
 105                      "filename": filename,
 106                      "fragment": fragment,
 107                      "type": item['type']
 108                      })
 109         if 'children' in item:
 110             _serialise(item['children'], stoc, depth + 1)
 111
 112
 113 def serialise_toc(rtoc):
 114     """Take the recursive TOC structure and turn it into a list of
 115     serial points.  Reformat some things for convenience."""
 116     stoc = []
 117     _serialise(rtoc, stoc, 1)
 118     for i, x in enumerate(stoc):
 119         x['position'] = i
 120     return stoc
 121
 122 def filename_toc_map(rtoc):
 123     tocmap = {}
 124     log(rtoc)
 125     def traverse(toc):
 126         for point in toc:
 127             log(point.keys())
 128             tocmap.setdefault(point['filename'], []).append(point)
 129             if 'children' in point:
 130                 traverse(point['children'])
 131     traverse(rtoc)
 132     return tocmap
 133
 134 def save_data(fn, data):
 135     """Save without tripping up on unicode"""
 136     if isinstance(data, unicode):
 137         data = data.encode('utf8', 'ignore')
 138     f = open(fn, 'w')
 139     f.write(data)
 140     f.close()
 141
 142
 143 class Book(object):
    # Numbering style handed to maker.number_pdf() for the main PDF
    # (see make_body_pdf and make_simple_pdf).
    page_numbers = 'latin'
    # Numbering style handed to maker.number_pdf() for the preamble
    # (see make_preamble_pdf).
    preamble_page_numbers = 'roman'
 146
 147     def notify_watcher(self, message=None):
 148         if self.watchers:
 149             if  message is None:
 150                 #message is the name of the caller
 151                 message = traceback.extract_stack(None, 2)[0][2]
 152             log("notify_watcher called with '%s'" % message)
 153             for w in self.watchers:
 154                 w(message)
 155
    def __enter__(self):
        """Context manager entry: the book itself is the managed object."""
        return self
 158
    def __exit__(self, exc_type, exc_value, tb):
        """Context manager exit: send config.FINISHED_MESSAGE to the
        watchers, then call self.cleanup().  The exception arguments are
        ignored and nothing is returned, so an exception raised inside
        the with-block carries on propagating."""
        self.notify_watcher(config.FINISHED_MESSAGE)
        self.cleanup()
        #could deal with exceptions here and return true
 163
 164
    def __init__(self, book, server, bookname,
                 page_settings=None, watchers=None, isbn=None,
                 license=config.DEFAULT_LICENSE, title=None,
                 max_age=0):
        """Fetch the book's zip, read its info.json and set up a working
        directory and the file names used by the later processing steps.

        book, server -- identify the book; both are passed to fetch_zip.
        bookname -- used in log messages, as the prefix of the working
            directory and as the file name under config.PUBLISH_DIR.
        page_settings -- keyword arguments for PageSettings.  If None,
            self.maker is NOT set.
        watchers -- optional iterable of callables; each is called with
            every progress message (see notify_watcher).
        isbn, license, title -- when not None, added to the metadata
            read from the zip.
        max_age -- passed straight on to fetch_zip.

        If fetching the zip raises HTTPError the watchers are told and
        sys.exit() is called.  ObjaviError is raised if info.json lacks
        any of 'manifest', 'metadata', 'spine' or 'TOC'.
        """
        log("*** Starting new book %s ***" % bookname)
        self.watchers = set()
        if watchers is not None:
            self.watchers.update(watchers)
        self.notify_watcher('start')
        self.bookname = bookname
        self.book = book
        self.server = server
        #10 distinct random letters; concat_html uses this as a prefix
        #for the element ids it invents.
        self.cookie = ''.join(random.sample(ascii_letters, 10))
        try:
            blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
        except HTTPError, e:
            traceback.print_exc()
            self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
            #not much to do?
            #raise 502 Bad Gateway ?
            sys.exit()
        #the zip is read from memory (blob), not from self.bookizip_file
        f = StringIO(blob)
        self.notify_watcher('fetch_zip')
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        for k in ('manifest', 'metadata', 'spine', 'TOC'):
            if k not in self.info:
                raise ObjaviError('info.json of %s lacks vital element "%s"' %
                                  (bookname, k))
            #check types also?

        self.metadata = self.info['metadata']
        self.spine = self.info['spine']
        self.manifest = self.info['manifest']

        #Only the local names are replaced here: self.server and
        #self.book keep the values given by the caller.
        if server == config.LOCALHOST: # [DEPRECATED]
            server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
            book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]

        log(pformat(self.metadata))
        #language, TOC header and text direction come from the metadata
        #when present, otherwise from guesses / per-server defaults.
        self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
        if not self.lang:
            self.lang = guess_lang(server, book)
            log('guessed lang as %s' % self.lang)

        self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
        if not self.toc_header:
            self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

        self.dir = get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0]
        if not self.dir:
            self.dir = guess_text_dir(server, book)


        #Patch in the extra metadata. (lang and dir may be set from config)
        #these should be read from zip -- so should go into zip?
        for var, key, scheme, ns in (
            (isbn, 'id', 'ISBN', config.DC),
            (license, 'rights', 'License', config.DC),
            (title, 'title', '', config.DC),
            (self.lang, 'language', '', config.DC),
            (self.dir, 'dir', '', config.FM),
            ):
            if var is not None:
                add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

        #read back from the metadata, so values already in the zip are
        #picked up even when the corresponding argument was None.
        self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
        self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]

        self.toc = self.info['TOC']
        #adds depth/filename/fragment/index to every TOC item, in place
        expand_toc(self.toc)

        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        #mkdtemp creates the directory readable by its owner only
        os.chmod(self.workdir, 0755)

        #names of the intermediate files, all inside self.workdir
        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        #set by make_end_matter_pdf, and only if there is an ISBN
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')

        self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))

        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        #title: explicit argument, else first metadata title, else a
        #made-up one based on the book identifier.
        if title is not None:
            self.title = title
        else:
            titles = get_metadata(self.metadata, 'title')
            if titles:
                self.title = titles[0]
            else:
                self.title = 'A Book About ' + self.book

        #no message given, so the watchers see the name '__init__'
        self.notify_watcher()
 265
 266
    #This `if` runs once, while the class body is being executed, so
    #the setting decides whether the class has a __del__ at all.
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            """Call self.cleanup() once, if the working directory still
            exists when the object is finalised."""
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
 274
 275     def get_tree_by_id(self, id):
 276         """get an HTML tree from the given manifest ID"""
 277         name = self.manifest[id]['url']
 278         mimetype = self.manifest[id]['mimetype']
 279         s = self.store.read(name)
 280         f = StringIO(s)
 281         if mimetype == 'text/html':
 282             try:
 283                 tree = lxml.html.parse(f)
 284             except etree.XMLSyntaxError, e:
 285                 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
 286                     (id, name, s[:20], e))
 287                 tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
 288         elif 'xml' in mimetype: #XXX or is this just asking for trouble?
 289             tree = etree.parse(f)
 290         else:
 291             tree = f.read()
 292         f.close()
 293         return tree
 294
    def filepath(self, fn):
        """Return the path of fn inside this book's working directory
        (self.workdir).  Nothing is created on disk."""
        return os.path.join(self.workdir, fn)
 297
 298     def save_tempfile(self, fn, data):
 299         """Save the data in a temporary directory that will be cleaned
 300         up when all is done.  Return the absolute file path."""
 301         fn = self.filepath(fn)
 302         save_data(fn, data)
 303         return fn
 304
 305     def make_oo_doc(self):
 306         """Make an openoffice document, using the html2odt script."""
 307         self.wait_for_xvfb()
 308         html_text = etree.tostring(self.tree, method="html")
 309         save_data(self.body_html_file, html_text)
 310         run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
 311         log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
 312         os.rename(self.body_odt_file, self.publish_file)
 313         self.notify_watcher()
 314
    def extract_pdf_outline(self):
        """Read the outline of the body PDF into self.outline_contents
        (a list of (title, depth, page number) tuples) and
        self.outline_text, and return the number of pages reported for
        the body PDF.

        If the outline comes back empty, a second PDF is rendered from a
        copy of the tree whose h1-h4 headings are replaced by plain keys
        such as 'h1_0'; the keys found in that PDF's outline are then
        mapped back to the real heading text.  The returned page count
        is still the one from the first PDF.
        """
        #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
        debugf = self.filepath('outline.txt')
        self.outline_contents, self.outline_text, number_of_pages = \
                parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            #work on a copy: self.tree must keep its real headings
            tree = copy.deepcopy(self.tree)
            #maps the stand-in key of each heading to its original text
            titlemap = {}
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    #drop all child elements of the heading
                    del e[:]
                    if tag == 'h1':
                        #for h1 the key goes into a new <strong> child,
                        #leaving the h1's own leading text in place
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii_outline.txt')
            ascii_contents, ascii_text, number_of_ascii_pages = \
                parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log ("number of pages: %s, post ascii: %s" %
                 (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                #keep only the last word, which is where the key ends up
                #when other text precedes it in the outline title
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                #unknown keys give an empty title rather than an error
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))
        else:
            for x in self.outline_contents:
                log(x)

        self.notify_watcher()
        return number_of_pages
 363
 364     def make_body_pdf(self):
 365         """Make a pdf of the HTML, using webkit"""
 366         #1. Save the html
 367         html_text = etree.tostring(self.tree, method="html")
 368         save_data(self.body_html_file, html_text)
 369
 370         #2. Make a pdf of it
 371         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
 372         self.notify_watcher('generate_pdf')
 373
 374         n_pages = self.extract_pdf_outline()
 375
 376         log ("found %s pages in pdf" % n_pages)
 377         #4. resize pages, shift gutters, even pages
 378         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 379         self.notify_watcher('reshape_pdf')
 380
 381         #5 add page numbers
 382         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 383                               numbers=self.page_numbers)
 384         self.notify_watcher("number_pdf")
 385         self.notify_watcher()
 386
 387     def make_preamble_pdf(self):
 388         contents = self.make_contents()
 389         inside_cover_html = self.compose_inside_cover()
 390         log(self.dir, self.css_url, self.title, inside_cover_html,
 391             self.toc_header, contents, self.title)
 392
 393         html = ('<html dir="%s"><head>\n'
 394                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 395                 '<link rel="stylesheet" href="%s" />\n'
 396                 '</head>\n<body>\n'
 397                 '<h1 class="frontpage">%s</h1>'
 398                 '%s\n'
 399                 '<div class="contents"><h1>%s</h1>\n%s</div>\n'
 400                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 401                 '<!--%s--></div></body></html>'
 402                 ) % (self.dir, self.css_url, self.title, inside_cover_html.decode('utf-8'),
 403                      self.toc_header, contents, self.title)
 404         save_data(self.preamble_html_file, html)
 405
 406         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
 407
 408         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 409
 410         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 411                             numbers=self.preamble_page_numbers,
 412                             number_start=-2)
 413
 414         self.notify_watcher()
 415
 416     def make_end_matter_pdf(self):
 417         """Make an inside back cover and a back cover.  If there is an
 418         isbn number its barcode will be put on the back cover."""
 419         if self.isbn:
 420             self.isbn_pdf_file = self.filepath('isbn.pdf')
 421             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 422             self.notify_watcher('make_barcode_pdf')
 423
 424         end_matter = self.compose_end_matter()
 425         log(end_matter)
 426         save_data(self.tail_html_file, end_matter.decode('utf-8'))
 427         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
 428
 429         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 430                                centre_end=True, even_pages=False)
 431         self.notify_watcher()
 432
 433     def make_book_pdf(self):
 434         """A convenient wrapper of a few necessary steps"""
 435         # now the Xvfb server is needed. make sure it has had long enough to get going
 436         self.wait_for_xvfb()
 437         self.make_body_pdf()
 438         self.make_preamble_pdf()
 439         self.make_end_matter_pdf()
 440
 441         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 442                     self.body_pdf_file, self.tail_pdf_file,
 443                     self.isbn_pdf_file)
 444
 445         self.notify_watcher('concatenated_pdfs')
 446
 447
 448     def make_simple_pdf(self, mode):
 449         """Make a simple pdf document without contents or separate
 450         title page.  This is used for multicolumn newspapers and for
 451         web-destined pdfs."""
 452         self.wait_for_xvfb()
 453         #0. Add heading to begining of html
 454         body = list(self.tree.cssselect('body'))[0]
 455         e = body.makeelement('h1', {'id': 'book-title'})
 456         e.text = self.title
 457         body.insert(0, e)
 458         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 459         e.addnext(intro)
 460
 461         #0.5 adjust parameters to suit the particular kind of output
 462         if mode == 'web':
 463             self.maker.gutter = 0
 464
 465         #1. Save the html
 466         html_text = etree.tostring(self.tree, method="html")
 467         save_data(self.body_html_file, html_text)
 468
 469         #2. Make a pdf of it (direct to to final pdf)
 470         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
 471         self.notify_watcher('generate_pdf')
 472         n_pages = count_pdf_pages(self.pdf_file)
 473
 474         if mode != 'web':
 475             #3. resize pages and shift gutters.
 476             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 477             self.notify_watcher('reshape_pdf')
 478
 479             #4. add page numbers
 480             self.maker.number_pdf(self.pdf_file, n_pages,
 481                                   dir=self.dir, numbers=self.page_numbers)
 482             self.notify_watcher("number_pdf")
 483         self.notify_watcher()
 484
 485
 486     def rotate180(self):
 487         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 488         presses."""
 489         rotated = self.filepath('final-rotate.pdf')
 490         unrotated = self.filepath('final-pre-rotate.pdf')
 491         #leave the unrotated pdf intact at first, in case of error.
 492         rotate_pdf(self.pdf_file, rotated)
 493         os.rename(self.pdf_file, unrotated)
 494         os.rename(rotated, self.pdf_file)
 495         self.notify_watcher()
 496
 497     def publish_pdf(self):
 498         """Move the finished PDF to its final resting place"""
 499         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 500         os.rename(self.pdf_file, self.publish_file)
 501         self.notify_watcher()
 502
 503     def publish_bookizip(self):
 504         """Publish the bookizip.  For this, copy rather than move,
 505         because the bookizip might be used by further processing.  If
 506         possible, a hard link is created."""
 507         log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
 508         try:
 509             run(['cp', '-l', self.bookizip_file, self.publish_file])
 510         except OSError:
 511             run(['cp', self.bookizip_file, self.publish_file])
 512         self.notify_watcher()
 513
 514     def concat_html(self):
 515         """Join all the chapters together into one tree.  Keep the TOC
 516         up-to-date along the way."""
 517
 518         #each manifest item looks like:
 519         #{'contributors': []
 520         #'license': [],
 521         #'mimetype': '',
 522         #'rightsholders': []
 523         #'url': ''}
 524         doc = lxml.html.document_fromstring('<html><body></body></html>')
 525         tocmap = filename_toc_map(self.toc)
 526         for ID in self.spine:
 527             details = self.manifest[ID]
 528             log(ID, pformat(details))
 529             # ACO MIJENJAO
 530             try:
 531                 root = self.get_tree_by_id(ID).getroot()
 532             except Exception, e:
 533                 log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
 534                 continue
 535             #handle any TOC points in this file
 536             for point in tocmap[details['url']]:
 537                 #if the url has a #identifier, use it. Otherwise, make
 538                 #one up, using a hidden element at the beginning of
 539                 #the inserted document.
 540                 #XXX this will break if different files use the same ids
 541                 #XXX should either replace all, or replace selectively.
 542                 if point['fragment']:
 543                     fragment = point['fragment']
 544                 else:
 545                     body = _find_tag(root, 'body')
 546                     fragment = '%s_%s' % (self.cookie, point['index'])
 547                     #reuse first tag if it is suitable.
 548                     if (len(body) and
 549                         body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
 550                         if body[0].get('id') is None:
 551                             body[0].set('id', fragment)
 552                         else:
 553                             fragment = body[0].get('id')
 554                         #the chapter starts with a heading. that heading should be the chapter name.
 555                         if body[0].tag in ('h1', 'h2', 'h3'):
 556                             log('chapter has title "%s", found html title "%s"' %
 557                                 (point['title'], body[0].text_content()))
 558                             point['html_title'] = body[0].text_content()
 559                     else:
 560                         marker = body.makeelement('div', style="display:none",
 561                                                   id=fragment)
 562                         body.insert(0, marker)
 563                 point['html_id'] = fragment
 564
 565             add_guts(root, doc)
 566         return doc
 567
 568     def unpack_static(self):
 569         """Extract static files from the zip for the html to refer to."""
 570         static_files = [x['url'] for x in self.manifest.values()
 571                         if x['url'].startswith('static')]
 572         if static_files:
 573             os.mkdir(self.filepath('static'))
 574
 575         for name in static_files:
 576             s = self.store.read(name)
 577             f = open(self.filepath(name), 'w')
 578             f.write(s)
 579             f.close()
 580         self.notify_watcher()
 581
 582     def load_book(self):
 583         """"""
 584         #XXX concatenate the HTML to match how TWiki version worked.
 585         # This is perhaps foolishly early -- throwing away useful boundaries.
 586         self.unpack_static()
 587         self.tree = self.concat_html()
 588         self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))
 589
 590         self.headings = [x for x in self.tree.cssselect('h1')]
 591         if self.headings:
 592             self.headings[0].set('class', "first-heading")
 593         for h1 in self.headings:
 594             h1.title = h1.text_content().strip()
 595         self.notify_watcher()
 596
 597     def make_contents(self):
 598         """Generate HTML containing the table of contents.  This can
 599         only be done after the main PDF has been made, because the
 600         page numbers are contained in the PDF outline."""
 601         header = '<h1>Table of Contents</h1><table class="toc">\n'
 602         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 603                     '<td class="pagenumber">%s</td></tr>\n')
 604         empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
 605         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 606         footer = '\n</table>'
 607
 608         contents = []
 609
 610         chapter = 1
 611         page_num = 1
 612
 613         outline_contents = iter(self.outline_contents)
 614
 615         for section in self.toc:
 616             if not section.get('children'):
 617                 contents.append(empty_section_tmpl % section['title'])
 618                 continue
 619             contents.append(section_tmpl % section['title'])
 620
 621             for point in section['children']:
 622                 try:
 623                     h1_text, level, page_num = outline_contents.next()
 624                 except StopIteration:
 625                     log("contents data not found for %s. Stopping" % (point,))
 626                     break
 627                 contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
 628                 chapter += 1
 629
 630         doc = header + '\n'.join(contents) + footer
 631         self.notify_watcher()
 632         return doc
 633
 634     def add_section_titles(self):
 635         """Add any section heading pages that the TOC.txt file
 636         specifies.  These are sub-book, super-chapter groupings.
 637
 638         Also add initial numbers to chapters.
 639         """
 640         chapter = 1
 641         section = None
 642         log(self.toc)
 643         for t in self.toc:
 644             #only top level sections get a subsection page,
 645             #and only if they have children.
 646             if t.get('children'):
 647                 section = self.tree.makeelement('div', Class="objavi-subsection")
 648                 heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
 649                 heading.text = t['title']
 650                 for child in t['children']:
 651                     item = etree.SubElement(section, 'div', Class="objavi-chapter")
 652                     if 'html_title' in child:
 653                         item.text = child['html_title']
 654                         heading = self.tree.cssselect('#'+ child['html_id'])
 655                         if heading:
 656                             _add_initial_number(heading[0], chapter)
 657                     else:
 658                         item.text = child['title']
 659                     _add_initial_number(item, chapter)
 660                     log(item.text, debug='HTMLGEN')
 661                     chapter += 1
 662                 log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
 663                 location = self.tree.cssselect('#'+ t['html_id'])[0]
 664                 location.addprevious(section)
 665
 666
 667         self.notify_watcher()
 668
 669
 670     def add_css(self, css=None, mode='book'):
 671         """If css looks like a url, use it as a stylesheet link.
 672         Otherwise it is the CSS itself, which is saved to a temporary file
 673         and linked to."""
 674         log("css is %r" % css)
 675         htmltree = self.tree
 676         if css is None or not css.strip():
 677             css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
 678             if css_default is None:
 679                 #guess from language -- this should come first
 680                 css_modes = config.LANGUAGE_CSS.get(self.lang,
 681                                                     config.LANGUAGE_CSS['en'])
 682                 css_default = css_modes.get(mode, css_modes[None])
 683             url = 'file://' + os.path.abspath(css_default)
 684         elif not re.match(r'^http://\S+$', css):
 685             fn = self.save_tempfile('objavi.css', css)
 686             url = 'file://' + fn
 687         else:
 688             url = css
 689         #XXX for debugging and perhaps sensible anyway
 690         #url = url.replace('file:///home/douglas/objavi2', '')
 691
 692
 693         #find the head -- it's probably first child but lets not assume.
 694         for child in htmltree:
 695             if child.tag == 'head':
 696                 head = child
 697                 break
 698         else:
 699             head = htmltree.makeelement('head')
 700             htmltree.insert(0, head)
 701
 702         link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 703         self.css_url = url
 704         self.notify_watcher()
 705         return url
 706
 707
 708     def _read_localised_template(self, template, fallbacks=['en']):
 709         """Try to get the template in the approriate language, otherwise in english."""
 710         for lang in [self.lang] + fallbacks:
 711             try:
 712                 fn = template % (lang)
 713                 f = open(fn)
 714                 break
 715             except IOError, e:
 716                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 717                 log(e)
 718         template = f.read()
 719         f.close()
 720         return template
 721
 722     def compose_inside_cover(self):
 723         """create the markup for the preamble inside cover."""
 724         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 725
 726         if self.isbn:
 727             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 728         else:
 729             isbn_text = ''
 730
 731         return template % {'date': time.strftime('%Y-%m-%d'),
 732                            'isbn': isbn_text,
 733                            'license': self.license,
 734                            }
 735
 736
 737     def compose_end_matter(self):
 738         """create the markup for the end_matter inside cover.  If
 739         self.isbn is not set, the html will result in a pdf that
 740         spills onto two pages.
 741         """
 742         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 743
 744         d = {'css_url': self.css_url,
 745              'title': self.title
 746              }
 747
 748         if self.isbn:
 749             d['inside_cover_style'] = ''
 750         else:
 751             d['inside_cover_style'] = 'page-break-after: always'
 752
 753         return template % d
 754
 755
 756     def make_epub(self, use_cache=False):
 757         """Make an epub version of the book, using Mike McCabe's
 758         epub module for the Internet Archive."""
 759         ebook = ia_epub.Book(self.publish_file, content_dir='')
 760         def add_file(ID, filename, mediatype, content):
 761             ebook.add_content({'media-type': mediatype.encode('utf-8'),
 762                                'id': ID.encode('utf-8'),
 763                                'href': filename.encode('utf-8'),
 764                                }, content)
 765
 766         toc = self.info['TOC']
 767
 768         #manifest
 769         filemap = {} #map html to corresponding xhtml
 770         spinemap = {} #map IDs to multi-file chapters
 771         for ID in self.manifest:
 772             details = self.manifest[ID]
 773             log(ID, pformat(details))
 774             fn, mediatype = details['url'], details['mimetype']
 775             content = self.store.read(fn)
 776             if mediatype == 'text/html':
 777                 #convert to application/xhtml+xml, and perhaps split
 778                 c = EpubChapter(self.server, self.book, ID, content,
 779                                 use_cache=use_cache)
 780                 c.remove_bad_tags()
 781                 if fn[-5:] == '.html':
 782                     fnbase = fn[:-5]
 783                 else:
 784                     fnbase = fn
 785                 fnx = fnbase + '.xhtml'
 786                 mediatype = 'application/xhtml+xml'
 787
 788                 fragments = split_html(c.as_xhtml(),
 789                                        compressed_size=self.store.getinfo(fn).compress_size)
 790
 791                 #add the first one as if it is the whole thing (as it often is)
 792                 add_file(ID, fnx, mediatype, fragments[0])
 793                 filemap[fn] = fnx
 794                 if len(fragments) > 1:
 795                     spine_ids = [ID]
 796                     spinemap[ID] = spine_ids
 797                     #add any extras
 798                     for i in range(1, len(fragments)):
 799                         # XXX it is possible for duplicates if another
 800                         # file happens to have this name. Ignore for now
 801                         _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
 802                         spine_ids.append(_id)
 803                         add_file(_id,
 804                                  '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
 805                                  mediatype, fragments[i])
 806
 807             else:
 808                 add_file(ID, fn, mediatype, content)
 809
 810         #toc
 811         ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
 812         ebook.add(ebook.content_dir + 'toc.ncx', ncx)
 813
 814         #spine
 815         for ID in self.spine:
 816             if ID in spinemap:
 817                 for x in spinemap[ID]:
 818                     ebook.add_spine_item({'idref': x})
 819             else:
 820                 ebook.add_spine_item({'idref': ID})
 821
 822         #metadata -- no use of attributes (yet)
 823         # and fm: metadata disappears for now
 824         DCNS = config.DCNS
 825         DC = config.DC
 826         meta_info_items = []
 827         for ns, namespace in self.metadata.items():
 828             for keyword, schemes in namespace.items():
 829                 if ns:
 830                     keyword = '{%s}%s' % (ns, keyword)
 831                 for scheme, values in schemes.items():
 832                     for value in values:
 833                         item = {
 834                             'item': keyword,
 835                             'text': value,
 836                             }
 837                         if scheme:
 838                             if keyword in (DCNS + 'creator', DCNS + 'contributor'):
 839                                 item['atts'] = {'role': scheme}
 840                             else:
 841                                 item['atts'] = {'scheme': scheme}
 842
 843         has_authors = 'creator' in self.metadata[DC]
 844         if not has_authors and config.CLAIM_UNAUTHORED:
 845             authors = []
 846             for x in self.metadata[DC]['creator'].values():
 847                 authors.extend(x)
 848
 849             meta_info_items.append({'item': DCNS + 'creator',
 850                                     'text': 'The Contributors'})
 851
 852             meta_info_items.append({'item': DCNS + 'rights',
 853                                     'text': 'This book is free. Copyright %s' % (', '.join(authors))}
 854                                    )
 855
 856         tree_str = ia_epub.make_opf(meta_info_items,
 857                                     ebook.manifest_items,
 858                                     ebook.spine_items,
 859                                     ebook.guide_items,
 860                                     ebook.cover_id)
 861         ebook.add(ebook.content_dir + 'content.opf', tree_str)
 862         ebook.z.close()
 863         self.notify_watcher()
 864
 865
 866     def publish_s3(self):
 867         """Push the book's epub to archive.org, using S3."""
 868         #XXX why only epub?
 869         secrets = {}
 870         for x in ('S3_SECRET', 'S3_ACCESSKEY'):
 871             fn = getattr(config, x)
 872             f = open(fn)
 873             secrets[x] = f.read().strip()
 874             f.close()
 875
 876         log(secrets)
 877         now = time.strftime('%F')
 878         s3output = self.filepath('s3-output.txt')
 879         s3url, detailsurl = find_archive_urls(self.book, self.bookname)
 880         headers = [
 881             'x-amz-auto-make-bucket:1',
 882             "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
 883             'x-archive-meta-mediatype:texts',
 884             'x-archive-meta-collection:opensource',
 885             'x-archive-meta-title:%s' % (self.book,),
 886             'x-archive-meta-date:%s' % (now,),
 887             'x-archive-meta-creator:FLOSS Manuals Contributors',
 888             ]
 889
 890         if self.license in config.LICENSES:
 891             headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
 892
 893         argv = ['curl', '--location', '-s', '-o', s3output]
 894         for h in headers:
 895             argv.extend(('--header', h))
 896         argv.extend(('--upload-file', self.publish_file, s3url,))
 897
 898         log(' '.join(repr(x) for x in argv))
 899         check_call(argv, stdout=sys.stderr)
 900         self.notify_watcher()
 901         return detailsurl, s3url
 902
 903
 904     def spawn_x(self):
 905         """Start an Xvfb instance, using a new server number.  A
 906         reference to it is stored in self.xvfb, which is used to kill
 907         it when the pdf is done.
 908
 909         Note that Xvfb doesn't interact well with dbus which is
 910         present on modern desktops.
 911         """
 912         #Find an unused server number (in case two cgis are running at once)
 913         while True:
 914             servernum = random.randrange(50, 500)
 915             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 916                 break
 917
 918         self.xserver_no = ':%s' % servernum
 919
 920         authfile = self.filepath('Xauthority')
 921         os.environ['XAUTHORITY'] = authfile
 922
 923         #mcookie(1) eats into /dev/random, so avoid that
 924         from hashlib import md5
 925         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 926         mcookie = m.hexdigest()
 927
 928         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 929
 930         self.xvfb = Popen(['Xvfb', self.xserver_no,
 931                            '-screen', '0', '1024x768x24',
 932                            '-pixdepths', '32',
 933                            #'-blackpixel', '0',
 934                            #'-whitepixel', str(2 ** 24 -1),
 935                            #'+extension', 'Composite',
 936                            '-dpi', '96',
 937                            #'-kb',
 938                            '-nolisten', 'tcp',
 939                            ])
 940
 941         # We need to wait a bit before the Xvfb is ready.  but the
 942         # downloads are so slow that that probably doesn't matter
 943
 944         self.xvfb_ready_time = time.time() + 2
 945
 946         os.environ['DISPLAY'] = self.xserver_no
 947         log(self.xserver_no)
 948
 949     def wait_for_xvfb(self):
 950         """wait until a previously set time before continuing.  This
 951         is so Xvfb has time to properly start."""
 952         if hasattr(self, 'xvfb'):
 953             d = self.xvfb_ready_time - time.time()
 954             if d > 0:
 955                 time.sleep(d)
 956                 self.notify_watcher()
 957
 958     def cleanup_x(self):
 959         """Try very hard to kill off Xvfb.  In addition to killing
 960         this instance's xvfb, occasionally (randomly) search for
 961         escaped Xvfb instances and kill those too."""
 962         if not hasattr(self, 'xvfb'):
 963             return
 964         check_call(['xauth', 'remove', self.xserver_no])
 965         p = self.xvfb
 966         log("trying to kill Xvfb %s" % p.pid)
 967         os.kill(p.pid, 15)
 968         for i in range(10):
 969             if p.poll() is not None:
 970                 log("%s died with %s" % (p.pid, p.poll()))
 971                 break
 972             log("%s not dead yet" % p.pid)
 973             time.sleep(0.2)
 974         else:
 975             log("Xvfb would not die! kill -9! kill -9!")
 976             os.kill(p.pid, 9)
 977
 978         if random.random() < 0.1:
 979             # occasionally kill old xvfbs and soffices, if there are any.
 980             self.kill_old_processes()
 981
 982     def kill_old_processes(self):
 983         """Sometimes, despite everything, Xvfb or soffice instances
 984         hang around well after they are wanted -- for example if the
 985         cgi process dies particularly badly. So kill them if they have
 986         been running for a long time."""
 987         log("running kill_old_processes")
 988         killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
 989                                    os.path.basename(config.HTML2ODT),
 990                                    os.path.basename(config.WKHTMLTOPDF),
 991                                    ])
 992         p = Popen(['ps', '-C', killable_names,
 993                    '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 994         data = p.communicate()[0].strip()
 995         if data:
 996             lines = data.split('\n')
 997             pids = []
 998             for line in lines:
 999                 log('dealing with ps output "%s"' % line)
1000                 try:
1001                     pid, days, hours, minutes, seconds \
1002                          = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1003                 except AttributeError:
1004                     log("Couldn't parse that line!")
1005                 # 50 minutes should be enough xvfb time for anyone
1006                 if days or hours or int(minutes) > 50:
1007                     pid = int(pid)
1008                     log("going to kill pid %s" % pid)
1009                     os.kill(pid, 15)
1010                     pids.append(pid)
1011
1012             time.sleep(1.0)
1013             for pid in pids:
1014                 #try again in case any are lingerers
1015                 try:
1016                     os.kill(int(pid), 9)
1017                 except OSError, e:
1018                     log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1019                     continue
1020                 log('killing %s with -9' % pid)
1021         self.notify_watcher()
1022
1023     def cleanup(self):
1024         self.cleanup_x()
1025         if not config.KEEP_TEMP_FILES:
1026             for fn in os.listdir(self.workdir):
1027                 os.remove(os.path.join(self.workdir, fn))
1028             os.rmdir(self.workdir)
1029         else:
1030             log("NOT removing '%s', containing the following files:" % self.workdir)
1031             log(*os.listdir(self.workdir))
1032
1033         self.notify_watcher()
1034
1035
def use_cache():
    """Say whether the HTTP_HOST in the environment is listed in
    config.USE_ZIP_CACHE_ALWAYS_HOSTS."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
1038
1039 def _read_cached_zip(server, book, max_age):
1040     #find a recent zip if possible
1041     prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1042     from glob import glob
1043     zips = sorted(glob(prefix + '*.zip'))
1044     if not zips:
1045         log("no cached booki-zips matching %s*.zip" % (prefix,))
1046         return None
1047     zipname = zips[-1]
1048     cutoff = time.time() - max_age * 60
1049     log(repr(zipname))
1050     try:
1051         date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1052         if date > cutoff:
1053             f = open(zipname)
1054             blob = f.read()
1055             f.close()
1056             return blob, zipname
1057         log("%s is too old, must reload" % zipname)
1058         return None
1059     except (IOError, IndexError, ValueError), e:
1060         log('could not make sense of %s: got exception %s' % (zipname, e))
1061         return None
1062
1063
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Get the booki-zip for <book> on <server>, from the cache or the net.

    <max_age> is the greatest acceptable age, in minutes, of a cached
    zip.  Zero means never use the cache; a negative value means use
    the default, which is 12 hours where use_cache() is true and no
    caching elsewhere.

    If the zip has to be fetched and <save> is true, it is written to
    <filename>, or to a generated name in config.BOOKI_BOOK_DIR if
    <filename> is None.

    Returns the tuple (blob, filename); the filename is None if the
    zip was fetched and neither saved nor given a name.  Raises
    NotImplementedError if there is no zip URL for the server's
    interface.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        # NOTE(review): HTTP_HOST is not defined in this function --
        # presumably it is a module-level name; confirm.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book':book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    # Only a positive age can ever be satisfied by a cached zip; a
    # leftover negative default used to trigger the warning and a
    # pointless directory search.
    if max_age > 0:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # zips are binary data
        f = open(filename, 'wb')
        try:
            f.write(blob)
        finally:
            f.close()
    return blob, filename
1096
1097
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    <html> is the document as a string.  <compressed_size> is its
    size when compressed; if None it is measured with zlib.  If
    <fix_markup> is true the document is reparsed and reserialised
    before the split points are chosen.

    Returns a list of html strings.  It is [html], untouched, if the
    document is within both config.EPUB_COMPRESSED_SIZE_MAX and
    config.EPUB_FILE_SIZE_MAX, or if no split point could be found.
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    # Put a marker in front of the first tag after each evenly spaced
    # target offset.
    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        if e == -1:
            # No tag left to split in front of.  Using -1 as a slice
            # index would put the marker inside the final tag.
            break
        if e <= s:
            # Same place as the previous split: a second marker here
            # would only fence off nothing.
            continue
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    if not fragments:
        # no marker went in, so there is nothing to split on
        return [html]
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]
1133