# Part of Objavi2, which turns html manuals into books.
# This contains classes representing books and coordinates their processing.
#
# Copyright (C) 2009 Douglas Bagnall
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

20 """Library module representing a complete FM book being turned into a
21 PDF"""
import os, sys
import tempfile
import re, time
import random
from subprocess import Popen, check_call, PIPE
from cStringIO import StringIO
from urllib2 import urlopen, HTTPError
import zipfile
import traceback
from string import ascii_letters
from pprint import pformat

try:
    import simplejson as json
except ImportError:
    import json

import lxml.html
from lxml import etree

from objavi import config, epub_utils
from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
from objavi.book_utils import ObjaviError
from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
from objavi.epub import add_guts, _find_tag
from objavi.xhtml_utils import EpubChapter, split_tree

from iarchive import epub as ia_epub
from booki.bookizip import get_metadata, add_metadata

TMPDIR = os.path.abspath(config.TMPDIR)
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
HTTP_HOST = os.environ.get('HTTP_HOST', '')
PUBLISH_PATH = "%s/books/" % DOC_ROOT

def find_archive_urls(bookid, bookname):
    s3url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
    detailsurl = 'http://archive.org/details/booki-%s' % (bookid,)
    return (s3url, detailsurl)

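
#For example (with hypothetical arguments):
#  find_archive_urls('44', 'mybook-en-2009.01.01-01.01.01.pdf')
#  == ('http://s3.us.archive.org/booki-44/mybook-en-2009.01.01-01.01.01.pdf',
#      'http://archive.org/details/booki-44')
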
def _get_best_title(tocpoint):
    if 'html_title' in tocpoint:
        return tocpoint['html_title']
    if 'title' in tocpoint:
        return tocpoint['title']
    return 'Untitled'

def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e."""
    initial = e.makeelement("strong", Class="initial")
    e.insert(0, initial)
    initial.tail = ' '
    if e.text is not None:
        initial.tail += e.text
    e.text = ''
    initial.text = "%s." % n

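
#For example (a sketch): given e = <h1>Dogs</h1>, _add_initial_number(e, 3)
#leaves <h1><strong class="initial">3.</strong> Dogs</h1>.
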
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience"""
    for item in toc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        item['depth'] = depth
        item["filename"] = filename
        item["fragment"] = fragment
        item["index"] = index
        index += 1
        if 'children' in item:
            index = expand_toc(item['children'], depth + 1, index)
    return index

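
#For example (hypothetical input): after expand_toc(toc), a point that
#arrived as {'url': '/ch1.html#s2', ...} also carries
#{'depth': 1, 'filename': 'ch1.html', 'fragment': 's2', 'index': 0},
#with 'index' counting points depth-first in document order.
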
def _serialise(rtoc, stoc, depth):
    for item in rtoc:
        url = item['url'].lstrip('/')
        bits = url.split('#', 1)
        filename = bits[0]
        fragment = (bits[1] if len(bits) == 2 else None)
        stoc.append({"depth": depth,
                     "title": item['title'],
                     "url": url,
                     "filename": filename,
                     "fragment": fragment,
                     "type": item['type']
                     })
        if 'children' in item:
            _serialise(item['children'], stoc, depth + 1)

def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points.  Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for i, x in enumerate(stoc):
        x['position'] = i
    return stoc

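
#For example, the nested TOC is flattened so that each point also gets a
#'position' key: the first point comes out with {'depth': 1, 'position': 0, ...}.
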
def filename_toc_map(rtoc):
    tocmap = {}
    log(rtoc)
    def traverse(toc):
        for point in toc:
            log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            if 'children' in point:
                traverse(point['children'])
    traverse(rtoc)
    return tocmap

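
#For example (assuming expand_toc() has already added the 'filename'
#keys): two points whose urls both name 'ch1.html' end up together in
#tocmap['ch1.html'].
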
class Book(object):
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'

    def notify_watcher(self, message=None):
        if self.watchers:
            if message is None:
                #message is the name of the caller
                message = traceback.extract_stack(None, 2)[0][2]
            log("notify_watcher called with '%s'" % message)
            for w in self.watchers:
                w(message)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.notify_watcher(config.FINISHED_MESSAGE)
        self.cleanup()
        #could deal with exceptions here and return true

    def __init__(self, book, server, bookname,
                 page_settings=None, watchers=None, isbn=None,
                 license=config.DEFAULT_LICENSE, title=None,
                 max_age=0):
        log("*** Starting new book %s ***" % bookname)
        self.watchers = set()
        if watchers is not None:
            self.watchers.update(watchers)
        self.notify_watcher('start')
        self.bookname = bookname
        self.book = book
        self.server = server
        self.cookie = ''.join(random.sample(ascii_letters, 10))
        try:
            blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
        except HTTPError, e:
            traceback.print_exc()
            self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
            #not much to do?
            #raise 502 Bad Gateway ?
            sys.exit()
        f = StringIO(blob)
        self.notify_watcher('fetch_zip')
        self.store = zipfile.ZipFile(f, 'r')
        self.info = json.loads(self.store.read('info.json'))
        for k in ('manifest', 'metadata', 'spine', 'TOC'):
            if k not in self.info:
                raise ObjaviError('info.json of %s lacks vital element "%s"' %
                                  (bookname, k))
            #check types also?

        self.metadata = self.info['metadata']
        self.spine = self.info['spine']
        self.manifest = self.info['manifest']

        if server == config.LOCALHOST: # [DEPRECATED]
            server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
            book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]

        log(pformat(self.metadata))
        self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
        if not self.lang:
            self.lang = guess_lang(server, book)
            log('guessed lang as %s' % self.lang)

        self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
        if not self.toc_header:
            self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']

        self.dir = get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0]
        if not self.dir:
            self.dir = guess_text_dir(server, book)

        #Patch in the extra metadata. (lang and dir may be set from config)
        #these should be read from zip -- so should go into zip?
        for var, key, scheme, ns in (
            (isbn, 'id', 'ISBN', config.DC),
            (license, 'rights', 'License', config.DC),
            (title, 'title', '', config.DC),
            (self.lang, 'language', '', config.DC),
            (self.dir, 'dir', '', config.FM),
            ):
            if var is not None:
                add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)

        self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
        self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]

        self.toc = self.info['TOC']
        expand_toc(self.toc)

        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        os.chmod(self.workdir, 0755)

        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')
        self.body_odt_file = self.filepath('body.odt')

        self.publish_file = os.path.join(PUBLISH_PATH, bookname)
        self.publish_url = os.path.join(config.PUBLISH_URL, bookname)

        if page_settings is not None:
            self.maker = PageSettings(**page_settings)

        if title is not None:
            self.title = title
        else:
            titles = get_metadata(self.metadata, 'title')
            if titles:
                self.title = titles[0]
            else:
                self.title = 'A Book About ' + self.book

        self.notify_watcher()

    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Don't even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()

    def get_tree_by_id(self, id):
        """get an HTML tree from the given manifest ID"""
        name = self.manifest[id]['url']
        mimetype = self.manifest[id]['mimetype']
        s = self.store.read(name)
        f = StringIO(s)
        if mimetype == 'text/html':
            try:
                tree = lxml.html.parse(f)
            except etree.XMLSyntaxError, e:
                log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                    (id, name, s[:20], e))
                tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
        elif 'xml' in mimetype: #XXX or is this just asking for trouble?
            tree = etree.parse(f)
        else:
            tree = f.read()
        f.close()
        return tree

    def filepath(self, fn):
        return os.path.join(self.workdir, fn)

    def save_data(self, fn, data):
        """Save without tripping up on unicode"""
        if isinstance(data, unicode):
            data = data.encode('utf8', 'ignore')
        f = open(fn, 'w')
        f.write(data)
        f.close()

    def save_tempfile(self, fn, data):
        """Save the data in a temporary directory that will be cleaned
        up when all is done.  Return the absolute file path."""
        fn = self.filepath(fn)
        self.save_data(fn, data)
        return fn

    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script."""
        self.wait_for_xvfb()
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()

    def extract_pdf_outline(self):
        #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
        debugf = self.filepath('outline.txt')
        self.outline_contents, self.outline_text, number_of_pages = \
            parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" % (key, e.text, titlemap[key]))

            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            self.save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii_outline.txt')
            ascii_contents, ascii_text, number_of_ascii_pages = \
                parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log("number of pages: %s, post ascii: %s" %
                (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))
        else:
            for x in self.outline_contents:
                log(x)

        self.notify_watcher()
        return number_of_pages

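
    # (parse_outline() returns the outline entries as (title, depth, pageno)
    # tuples -- e.g. ('Introduction', 1, 9) -- along with the raw outline
    # text and the page count; the tuples later drive make_contents().)
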
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
        self.notify_watcher('generate_pdf')

        #3. extract the outline, for the table of contents
        n_pages = self.extract_pdf_outline()

        log("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5. add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()

    def make_preamble_pdf(self):
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log(self.dir, self.css_url, self.title, inside_cover_html,
            self.toc_header, contents, self.title)

        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html.decode('utf-8'),
                     self.toc_header, contents, self.title)
        self.save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()

    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover.  If there is an
        ISBN number, its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        log(end_matter)
        self.save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()

    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')

    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page.  This is used for multicolumn newspapers and for
        web-destined pdfs."""
        self.wait_for_xvfb()
        #0. Add heading to beginning of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to the final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()

    def rotate180(self):
        """Rotate the pdf 180 degrees so an RTL book can print on LTR
        presses."""
        rotated = self.filepath('final-rotate.pdf')
        unrotated = self.filepath('final-pre-rotate.pdf')
        #leave the unrotated pdf intact at first, in case of error.
        rotate_pdf(self.pdf_file, rotated)
        os.rename(self.pdf_file, unrotated)
        os.rename(rotated, self.pdf_file)
        self.notify_watcher()

    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()

    def publish_bookizip(self):
        """Publish the bookizip.  For this, copy rather than move,
        because the bookizip might be used by further processing.  If
        possible, a hard link is created."""
        log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
        try:
            run(['cp', '-l', self.bookizip_file, self.publish_file])
        except OSError:
            run(['cp', self.bookizip_file, self.publish_file])
        self.notify_watcher()

    def concat_html(self):
        """Join all the chapters together into one tree.  Keep the TOC
        up-to-date along the way."""
        #each manifest item looks like:
        #{'contributors': [],
        # 'license': [],
        # 'mimetype': '',
        # 'rightsholders': [],
        # 'url': ''}
        doc = lxml.html.document_fromstring('<html><body></body></html>')
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            log(ID, pformat(details))
            # ACO changed this: skip chapters whose HTML cannot be loaded
            try:
                root = self.get_tree_by_id(ID).getroot()
            except:
                continue
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            log('chapter has title "%s", found html title "%s"' %
                                (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                    point['html_id'] = fragment

            add_guts(root, doc)
        return doc

    def unpack_static(self):
        """Extract static files from the zip for the html to refer to."""
        static_files = [x['url'] for x in self.manifest.values()
                        if x['url'].startswith('static')]
        if static_files:
            os.mkdir(self.filepath('static'))

        for name in static_files:
            s = self.store.read(name)
            f = open(self.filepath(name), 'w')
            f.write(s)
            f.close()
        self.notify_watcher()

    def load_book(self):
        """Assemble the book's HTML into a single tree, and note its
        chapter headings."""
        #XXX concatenate the HTML to match how TWiki version worked.
        # This is perhaps foolishly early -- throwing away useful boundaries.
        self.unpack_static()
        self.tree = self.concat_html()
        self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            h1.title = h1.text_content().strip()
        self.notify_watcher()

    def make_contents(self):
        """Generate HTML containing the table of contents.  This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline."""
        header = '<h1>Table of Contents</h1><table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        subsections = [] # for the subsection heading pages.

        outline_contents = iter(self.outline_contents)
        headings = iter(self.headings)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        self.notify_watcher()
        return doc

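
    # A generated row looks like, for example:
    #   <tr><td class="chapter">1</td><td class="title">Introduction</td>
    #   <td class="pagenumber">9</td></tr>
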
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters."""
        headings = iter(self.headings)
        chapter = 1
        section = None
        log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#' + child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                log("#%s is %s" % (t['html_id'], self.tree.cssselect('#' + t['html_id'])))
                location = self.tree.cssselect('#' + t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()

    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to."""
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
            if css_default is None:
                #guess from language -- this should come first
                css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                    config.LANGUAGE_CSS['en'])
                css_default = css_modes.get(mode, css_modes[None])
            url = 'file://' + os.path.abspath(css_default)
        elif not re.match(r'^http://\S+$', css):
            fn = self.save_tempfile('objavi.css', css)
            url = 'file://' + fn
        else:
            url = css
        #XXX for debugging and perhaps sensible anyway
        #url = url.replace('file:///home/douglas/objavi2', '')

        #find the head -- it's probably the first child but let's not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url

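
    # Typical usage (a sketch -- raw CSS is saved to a temp file and linked,
    # while an http:// URL is linked to directly):
    #   book.add_css('h1 { page-break-before: always; }', mode='book')
    #   book.add_css('http://example.com/objavi.css')
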
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the appropriate language, otherwise in English."""
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open template for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template

    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           }

    def compose_end_matter(self):
        """create the markup for the end_matter inside cover.  If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages."""
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title,
             }

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d

    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive."""
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        def add_file(ID, filename, mediatype, content):
            ebook.add_content({'media-type': mediatype.encode('utf-8'),
                               'id': ID.encode('utf-8'),
                               'href': filename.encode('utf-8'),
                               }, content)

        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        spinemap = {} #map IDs to multi-file chapters
        for ID in self.manifest:
            details = self.manifest[ID]
            log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            content = self.store.read(fn)
            if mediatype == 'text/html':
                #convert to application/xhtml+xml, and perhaps split
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                if fn[-5:] == '.html':
                    fnbase = fn[:-5]
                else:
                    fnbase = fn
                fnx = fnbase + '.xhtml'
                mediatype = 'application/xhtml+xml'

                fragments = split_html(c.as_xhtml(),
                                       compressed_size=self.store.getinfo(fn).compress_size)

                #add the first one as if it is the whole thing (as it often is)
                add_file(ID, fnx, mediatype, fragments[0])
                filemap[fn] = fnx
                if len(fragments) > 1:
                    spine_ids = [ID]
                    spinemap[ID] = spine_ids
                    #add any extras
                    for i in range(1, len(fragments)):
                        # XXX it is possible for duplicates if another
                        # file happens to have this name. Ignore for now
                        _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                        spine_ids.append(_id)
                        add_file(_id,
                                 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                                 mediatype, fragments[i])

            else:
                add_file(ID, fn, mediatype, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            if ID in spinemap:
                for x in spinemap[ID]:
                    ebook.add_spine_item({'idref': x})
            else:
                ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                        }
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}
                        meta_info_items.append(item)

        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            authors = []
            #use .get() -- when has_authors is false there is no 'creator' key
            for x in self.metadata[DC].get('creator', {}).values():
                authors.extend(x)

            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))})

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()
        self.notify_watcher()

    def publish_s3(self):
        """Push the book's epub to archive.org, using S3."""
        #XXX why only epub?
        secrets = {}
        for x in ('S3_SECRET', 'S3_ACCESSKEY'):
            fn = getattr(config, x)
            f = open(fn)
            secrets[x] = f.read().strip()
            f.close()

        log(secrets)
        now = time.strftime('%F')
        s3output = self.filepath('s3-output.txt')
        s3url, detailsurl = find_archive_urls(self.book, self.bookname)
        headers = [
            'x-amz-auto-make-bucket:1',
            "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
            'x-archive-meta-mediatype:texts',
            'x-archive-meta-collection:opensource',
            'x-archive-meta-title:%s' % (self.book,),
            'x-archive-meta-date:%s' % (now,),
            'x-archive-meta-creator:FLOSS Manuals Contributors',
            ]

        if self.license in config.LICENSES:
            headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

        argv = ['curl', '--location', '-s', '-o', s3output]
        for h in headers:
            argv.extend(('--header', h))
        argv.extend(('--upload-file', self.publish_file, s3url,))

        log(' '.join(repr(x) for x in argv))
        check_call(argv, stdout=sys.stderr)
        self.notify_watcher()
        return detailsurl, s3url

    def spawn_x(self):
        """Start an Xvfb instance, using a new server number.  A
        reference to it is stored in self.xvfb, which is used to kill
        it when the pdf is done.

        Note that Xvfb doesn't interact well with dbus which is
        present on modern desktops.
        """
        #Find an unused server number (in case two cgis are running at once)
        while True:
            servernum = random.randrange(50, 500)
            if not os.path.exists('/tmp/.X%s-lock' % servernum):
                break

        self.xserver_no = ':%s' % servernum

        authfile = self.filepath('Xauthority')
        os.environ['XAUTHORITY'] = authfile

        #mcookie(1) eats into /dev/random, so avoid that
        from hashlib import md5
        m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
        mcookie = m.hexdigest()

        check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

        self.xvfb = Popen(['Xvfb', self.xserver_no,
                           '-screen', '0', '1024x768x24',
                           '-pixdepths', '32',
                           #'-blackpixel', '0',
                           #'-whitepixel', str(2 ** 24 - 1),
                           #'+extension', 'Composite',
                           '-dpi', '96',
                           '-kb',
                           '-nolisten', 'tcp',
                           ])

        # We need to wait a bit before the Xvfb is ready.  but the
        # downloads are so slow that that probably doesn't matter
        self.xvfb_ready_time = time.time() + 2

        os.environ['DISPLAY'] = self.xserver_no
        log(self.xserver_no)

    def wait_for_xvfb(self):
        """wait until a previously set time before continuing.  This
        is so Xvfb has time to properly start."""
        if hasattr(self, 'xvfb'):
            d = self.xvfb_ready_time - time.time()
            if d > 0:
                time.sleep(d)
            self.notify_watcher()

    def cleanup_x(self):
        """Try very hard to kill off Xvfb.  In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        if not hasattr(self, 'xvfb'):
            return
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            log("Xvfb would not die! kill -9! kill -9!")
            os.kill(p.pid, 9)

        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()

    def kill_old_processes(self):
        """Sometimes, despite everything, Xvfb or soffice instances
        hang around well after they are wanted -- for example if the
        cgi process dies particularly badly.  So kill them if they have
        been running for a long time."""
        log("running kill_old_processes")
        p = Popen(['ps', '-C', 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
                   '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
        data = p.communicate()[0].strip()
        if data:
            lines = data.split('\n')
            pids = []
            for line in lines:
                log('dealing with ps output "%s"' % line)
                try:
                    pid, days, hours, minutes, seconds \
                        = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
                except AttributeError:
                    log("Couldn't parse that line!")
                    continue
                # 50 minutes should be enough xvfb time for anyone
                if days or hours or int(minutes) > 50:
                    pid = int(pid)
                    log("going to kill pid %s" % pid)
                    os.kill(pid, 15)
                    pids.append(pid)

            time.sleep(1.0)
            for pid in pids:
                #try again in case any are lingerers
                try:
                    os.kill(int(pid), 9)
                except OSError, e:
                    log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                    continue
                log('killing %s with -9' % pid)
        self.notify_watcher()

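
    # The etime values parsed above look like, for example, '1-02:33:44'
    # (days-hours:minutes:seconds), '02:33:44', or '33:44'; the optional
    # groups in the regex cover all three forms.
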
    def cleanup(self):
        self.cleanup_x()
        if not config.KEEP_TEMP_FILES:
            for fn in os.listdir(self.workdir):
                os.remove(os.path.join(self.workdir, fn))
            os.rmdir(self.workdir)
        else:
            log("NOT removing '%s', containing the following files:" % self.workdir)
            log(*os.listdir(self.workdir))

        self.notify_watcher()

def use_cache():
    return (os.environ.get('HTTP_HOST') in config.USE_ZIP_CACHE_ALWAYS_HOSTS)

def _read_cached_zip(server, book, max_age):
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            f = open(zipname)
            blob = f.read()
            f.close()
            return blob, zipname
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError), e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None

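
#Cached zip names have the form <prefix>-YYYY.MM.DD-HH.MM.SS.zip, where
#<prefix> comes from make_book_name(); the glob above matches on the
#prefix and strptime() then checks the datestamp against max_age.
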
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book': book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s' % url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        f = open(filename, 'w')
        f.write(blob)
        f.close()
    return blob, filename

def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader."""
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g. before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]

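
#For example (a sketch): split_html(html) returns [html] untouched while
#the document is under the configured size limits; otherwise markers like
#<hr class="..." id="split_0" /> are inserted at tag boundaries and
#split_tree() re-parses the result into one tree per fragment.
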