quieter logs please
[objavi2.git] / objavi / fmbook.py
blob19bfd6a04bea87a104104418e5b672e78f2e874b
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 from subprocess import Popen, check_call, PIPE
28 from cStringIO import StringIO
29 from urllib2 import urlopen, HTTPError
30 import zipfile
31 import traceback
32 from string import ascii_letters
33 from pprint import pformat
35 try:
36 import simplejson as json
37 except ImportError:
38 import json
40 import lxml.html
41 from lxml import etree
43 from objavi import config, epub_utils
44 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
45 from objavi.book_utils import ObjaviError, log_types
46 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
47 from objavi.epub import add_guts, _find_tag
48 from objavi.xhtml_utils import EpubChapter, split_tree
49 from objavi.cgi_utils import url2path
51 from iarchive import epub as ia_epub
52 from booki.bookizip import get_metadata, add_metadata
# Working directory for intermediate book files (absolute, from config).
TMPDIR = os.path.abspath(config.TMPDIR)
# Web document root: from the CGI environment when available, otherwise
# the configured htdocs directory.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Host name of the current request ('' when not running under a web server).
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the (s3 upload url, details page url) pair on archive.org
    for the given book id and output filename."""
    bucket = 'booki-%s' % (bookid,)
    upload_url = 'http://s3.us.archive.org/%s/%s' % (bucket, bookname)
    details_url = 'http://archive.org/details/%s' % (bucket,)
    return (upload_url, details_url)
63 def _get_best_title(tocpoint):
64 if 'html_title' in tocpoint:
65 return tocpoint['html_title']
66 if 'title' in tocpoint:
67 return tocpoint['title']
68 return 'Untitled'
def _add_initial_number(e, n):
    """Put a styled chapter number n at the beginning of element e."""
    marker = e.makeelement("strong", Class="initial")
    e.insert(0, marker)
    marker.text = "%s." % n
    #shift the element's leading text behind the inserted number
    marker.tail = ' '
    if e.text is not None:
        marker.tail += e.text
        e.text = ''
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience: annotate each point with
    its depth, filename, fragment, and serial index.  Returns the next
    unused index."""
    for point in toc:
        path = point['url'].lstrip('/')
        parts = path.split('#', 1)
        point['depth'] = depth
        point["filename"] = parts[0]
        point["fragment"] = parts[1] if len(parts) > 1 else None
        point["index"] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
97 def _serialise(rtoc, stoc, depth):
98 for item in rtoc:
99 url = item['url'].lstrip('/')
100 bits = url.split('#', 1)
101 filename = bits[0]
102 fragment = (bits[1] if len(bits) == 2 else None)
103 stoc.append({"depth": depth,
104 "title": item['title'],
105 "url": url,
106 "filename": filename,
107 "fragment": fragment,
108 "type": item['type']
110 if 'children' in item:
111 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points. Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    #record each point's place in the flat sequence
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Map each filename to the list of TOC points that refer to it,
    in depth-first pre-order."""
    tocmap = {}
    pending = list(rtoc)
    while pending:
        point = pending.pop(0)
        tocmap.setdefault(point['filename'], []).append(point)
        #visit children immediately after their parent (pre-order)
        pending[0:0] = point.get('children', [])
    return tocmap
def save_data(fn, data):
    """Save without tripping up on unicode.

    fn -- destination path; data -- str or unicode (encoded as utf8).
    The file is now closed even if the write fails (previously the
    handle leaked on error).
    """
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    f = open(fn, 'w')
    try:
        f.write(data)
    finally:
        f.close()
class Book(object):
    """A bookizip fetched from a server, being rendered into a book."""
    # numbering style for the main body pages
    page_numbers = 'latin'
    # numbering style for the front-matter (preamble) pages
    preamble_page_numbers = 'roman'
148 def notify_watcher(self, message=None):
149 if self.watchers:
150 if message is None:
151 #message is the name of the caller
152 message = traceback.extract_stack(None, 2)[0][2]
153 log("notify_watcher called with '%s'" % message)
154 for w in self.watchers:
155 w(message)
157 def __enter__(self):
158 return self
160 def __exit__(self, exc_type, exc_value, tb):
161 self.notify_watcher(config.FINISHED_MESSAGE)
162 self.cleanup()
163 #could deal with exceptions here and return true
166 def __init__(self, book, server, bookname,
167 page_settings=None, watchers=None, isbn=None,
168 license=config.DEFAULT_LICENSE, title=None,
169 max_age=0):
170 log("*** Starting new book %s ***" % bookname)
171 self.watchers = set()
172 if watchers is not None:
173 self.watchers.update(watchers)
174 self.notify_watcher('start')
175 self.bookname = bookname
176 self.book = book
177 self.server = server
178 self.cookie = ''.join(random.sample(ascii_letters, 10))
179 try:
180 blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
181 except HTTPError, e:
182 traceback.print_exc()
183 self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
184 #not much to do?
185 #raise 502 Bad Gateway ?
186 sys.exit()
187 f = StringIO(blob)
188 self.notify_watcher('fetch_zip')
189 self.store = zipfile.ZipFile(f, 'r')
190 self.info = json.loads(self.store.read('info.json'))
191 for k in ('manifest', 'metadata', 'spine', 'TOC'):
192 if k not in self.info:
193 raise ObjaviError('info.json of %s lacks vital element "%s"' %
194 (bookname, k))
195 #check types also?
197 self.metadata = self.info['metadata']
198 self.spine = self.info['spine']
199 self.manifest = self.info['manifest']
201 if server == config.LOCALHOST: # [DEPRECATED]
202 server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
203 book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
205 log(pformat(self.metadata))
206 self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
207 if not self.lang:
208 self.lang = guess_lang(server, book)
209 log('guessed lang as %s' % self.lang)
211 self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
212 if not self.toc_header:
213 self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
215 self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
216 if not self.dir:
217 self.dir = guess_text_dir(server, book)
219 #Patch in the extra metadata. (lang and dir may be set from config)
220 #these should be read from zip -- so should go into zip?
221 for var, key, scheme, ns in (
222 (isbn, 'id', 'ISBN', config.DC),
223 (license, 'rights', 'License', config.DC),
224 (title, 'title', '', config.DC),
225 (self.lang, 'language', '', config.DC),
226 (self.dir, 'dir', '', config.FM),
228 if var is not None:
229 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
231 self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
232 self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
234 self.toc = self.info['TOC']
235 expand_toc(self.toc)
237 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
238 os.chmod(self.workdir, 0755)
240 self.body_html_file = self.filepath('body.html')
241 self.body_pdf_file = self.filepath('body.pdf')
242 self.preamble_html_file = self.filepath('preamble.html')
243 self.preamble_pdf_file = self.filepath('preamble.pdf')
244 self.tail_html_file = self.filepath('tail.html')
245 self.tail_pdf_file = self.filepath('tail.pdf')
246 self.isbn_pdf_file = None
247 self.pdf_file = self.filepath('final.pdf')
248 self.body_odt_file = self.filepath('body.odt')
249 self.outline_file = self.filepath('outline.txt')
251 self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
253 if page_settings is not None:
254 self.maker = PageSettings(**page_settings)
256 if title is not None:
257 self.title = title
258 else:
259 titles = get_metadata(self.metadata, 'title')
260 if titles:
261 self.title = titles[0]
262 else:
263 self.title = 'A Book About ' + self.book
265 self.notify_watcher()
    #Optionally clean up the workdir when the Book is garbage collected.
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
    def get_tree_by_id(self, id):
        """get an HTML tree from the given manifest ID

        Returns an lxml tree for html/xml content, or the raw string
        read from the zip for any other mimetype.
        """
        name = self.manifest[id]['url']
        mimetype = self.manifest[id]['mimetype']
        s = self.store.read(name)
        f = StringIO(s)
        if mimetype == 'text/html':
            try:
                tree = lxml.html.parse(f)
            except etree.XMLSyntaxError, e:
                log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                    (id, name, s[:20], e))
                #fall back to an empty document so processing can continue
                tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
        elif 'xml' in mimetype: #XXX or is this just asking for trouble?
            tree = etree.parse(f)
        else:
            #not markup: hand back the raw bytes
            tree = f.read()
        f.close()
        return tree
296 def filepath(self, fn):
297 return os.path.join(self.workdir, fn)
299 def save_tempfile(self, fn, data):
300 """Save the data in a temporary directory that will be cleaned
301 up when all is done. Return the absolute file path."""
302 fn = self.filepath(fn)
303 save_data(fn, data)
304 return fn
306 def make_oo_doc(self):
307 """Make an openoffice document, using the html2odt script."""
308 self.wait_for_xvfb()
309 html_text = etree.tostring(self.tree, method="html")
310 save_data(self.body_html_file, html_text)
311 run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
312 log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
313 os.rename(self.body_odt_file, self.publish_file)
314 self.notify_watcher()
316 def extract_pdf_outline(self):
317 """Get the outline (table of contents) for the PDF, which
318 wkhtmltopdf should have written to a file. If that file
319 doesn't exist (or config says not to use it), fall back to
320 using self._extract_pdf_outline_the_old_way, below.
322 if config.USE_DUMP_OUTLINE:
323 try:
324 self.outline_contents, number_of_pages = \
325 parse_extracted_outline(self.outline_file)
327 except Exception, e:
328 traceback.print_exc()
329 number_of_pages = self._extract_pdf_outline_the_old_way()
330 else:
331 number_of_pages = self._extract_pdf_outline_the_old_way()
333 self.notify_watcher()
334 return number_of_pages
    def _extract_pdf_outline_the_old_way(self):
        """Try to get the PDF outline using pdftk. This doesn't work
        well with all scripts.

        Returns the number of pages found; sets self.outline_contents
        to a list of (title, depth, pageno) tuples.
        """
        debugf = self.filepath('extracted-outline.txt')
        self.outline_contents, number_of_pages = \
                parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            #replace each heading's content with an ascii key, remembering
            #the real title so it can be restored after parsing
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        #h1 text goes inside a strong.initial, matching body style
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii-extracted-outline.txt')
            ascii_contents, number_of_ascii_pages = \
                    parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log ("number of pages: %s, post ascii: %s" %
                 (number_of_pages, number_of_ascii_pages))
            #map the ascii keys back to the original titles
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))

        return number_of_pages
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')

        #3. get the outline/page count for the table of contents
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()
    def make_preamble_pdf(self):
        """Make the front-matter PDF: inside cover page plus table of
        contents, reshaped and numbered in the preamble style."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log_types(self.dir, self.css_url, self.title, inside_cover_html,
                  self.toc_header, contents, self.title)

        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     self.toc_header, contents, self.title)
        save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        #number_start=-2 -- presumably offsets numbering past the cover
        #pages; TODO confirm against pdf.PageSettings.number_pdf
        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()
    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover. If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        #log(end_matter)
        #NOTE(review): assumes the end-matter template is utf-8 encoded
        #bytes -- confirm against the template files
        save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()
    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        #join front matter, body, end matter (and isbn page, if any;
        #concat_pdfs presumably skips the None isbn file -- confirm)
        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page. This is used for multicolumn newspapers and for
        web-destined pdfs.

        mode -- 'web' skips gutters, reshaping and page numbers.
        """
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()
504 def rotate180(self):
505 """Rotate the pdf 180 degrees so an RTL book can print on LTR
506 presses."""
507 rotated = self.filepath('final-rotate.pdf')
508 unrotated = self.filepath('final-pre-rotate.pdf')
509 #leave the unrotated pdf intact at first, in case of error.
510 rotate_pdf(self.pdf_file, rotated)
511 os.rename(self.pdf_file, unrotated)
512 os.rename(rotated, self.pdf_file)
513 self.notify_watcher()
515 def publish_pdf(self):
516 """Move the finished PDF to its final resting place"""
517 log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
518 os.rename(self.pdf_file, self.publish_file)
519 self.notify_watcher()
    def publish_bookizip(self):
        """Publish the bookizip. For this, copy rather than move,
        because the bookizip might be used by further processing. If
        possible, a hard link is created."""
        log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
        try:
            #hard link first: cheap, and the published copy is identical
            #NOTE(review): assumes run() raises OSError on failure --
            #confirm against book_utils.run
            run(['cp', '-l', self.bookizip_file, self.publish_file])
        except OSError:
            #hard link failed (e.g. cross-device): fall back to a real copy
            run(['cp', self.bookizip_file, self.publish_file])
        self.notify_watcher()
532 def concat_html(self):
533 """Join all the chapters together into one tree. Keep the TOC
534 up-to-date along the way."""
536 #each manifest item looks like:
537 #{'contributors': []
538 #'license': [],
539 #'mimetype': '',
540 #'rightsholders': []
541 #'url': ''}
542 doc = lxml.html.document_fromstring('<html><body></body></html>')
543 tocmap = filename_toc_map(self.toc)
544 for ID in self.spine:
545 details = self.manifest[ID]
546 #log(ID, pformat(details))
547 # ACO MIJENJAO
548 try:
549 root = self.get_tree_by_id(ID).getroot()
550 except Exception, e:
551 log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
552 continue
553 #handle any TOC points in this file
554 for point in tocmap[details['url']]:
555 #if the url has a #identifier, use it. Otherwise, make
556 #one up, using a hidden element at the beginning of
557 #the inserted document.
558 #XXX this will break if different files use the same ids
559 #XXX should either replace all, or replace selectively.
560 if point['fragment']:
561 fragment = point['fragment']
562 else:
563 body = _find_tag(root, 'body')
564 fragment = '%s_%s' % (self.cookie, point['index'])
565 #reuse first tag if it is suitable.
566 if (len(body) and
567 body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
568 if body[0].get('id') is None:
569 body[0].set('id', fragment)
570 else:
571 fragment = body[0].get('id')
572 #the chapter starts with a heading. that heading should be the chapter name.
573 if body[0].tag in ('h1', 'h2', 'h3'):
574 log('chapter has title "%s", found html title "%s"' %
575 (point['title'], body[0].text_content()))
576 point['html_title'] = body[0].text_content()
577 else:
578 marker = body.makeelement('div', style="display:none",
579 id=fragment)
580 body.insert(0, marker)
581 point['html_id'] = fragment
583 add_guts(root, doc)
584 return doc
586 def unpack_static(self):
587 """Extract static files from the zip for the html to refer to."""
588 static_files = [x['url'] for x in self.manifest.values()
589 if x['url'].startswith('static')]
590 if static_files:
591 os.mkdir(self.filepath('static'))
593 for name in static_files:
594 s = self.store.read(name)
595 f = open(self.filepath(name), 'w')
596 f.write(s)
597 f.close()
598 self.notify_watcher()
    def load_book(self):
        """Assemble the book into a single tree (self.tree) and note
        its h1 headings."""
        #XXX concatenate the HTML to match how TWiki version worked.
        # This is perhaps foolishly early -- throwing away useful boundaries.
        self.unpack_static()
        self.tree = self.concat_html()
        self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

        self.headings = [x for x in self.tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            #NOTE(review): this sets a python attribute on the element
            #object, not an html attribute -- presumably read back
            #later in processing; confirm
            h1.title = h1.text_content().strip()
        self.notify_watcher()
    def make_contents(self):
        """Generate HTML containing the table of contents. This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline."""
        header = '<table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        #log(self.outline_contents)
        outline_contents = iter(self.outline_contents)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    #skip outline entries until the next depth-1 heading,
                    #which carries this chapter's page number
                    level = 99
                    while level > 1:
                        h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        if isinstance(doc, unicode):
            doc = doc.encode('utf-8')
        self.notify_watcher()
        return doc
656 def add_section_titles(self):
657 """Add any section heading pages that the TOC.txt file
658 specifies. These are sub-book, super-chapter groupings.
660 Also add initial numbers to chapters.
662 chapter = 1
663 section = None
664 #log(self.toc)
665 for t in self.toc:
666 #only top level sections get a subsection page,
667 #and only if they have children.
668 if t.get('children'):
669 section = self.tree.makeelement('div', Class="objavi-subsection")
670 heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
671 heading.text = t['title']
672 for child in t['children']:
673 item = etree.SubElement(section, 'div', Class="objavi-chapter")
674 if 'html_title' in child:
675 item.text = child['html_title']
676 heading = self.tree.cssselect('#'+ child['html_id'])
677 if heading:
678 _add_initial_number(heading[0], chapter)
679 else:
680 item.text = child['title']
681 _add_initial_number(item, chapter)
682 log(item.text, debug='HTMLGEN')
683 chapter += 1
684 log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
685 location = self.tree.cssselect('#'+ t['html_id'])[0]
686 location.addprevious(section)
689 self.notify_watcher()
692 def add_css(self, css=None, mode='book'):
693 """If css looks like a url, use it as a stylesheet link.
694 Otherwise it is the CSS itself, which is saved to a temporary file
695 and linked to."""
696 log("css is %r" % css)
697 htmltree = self.tree
698 if css is None or not css.strip():
699 css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
700 if css_default is None:
701 #guess from language -- this should come first
702 css_modes = config.LANGUAGE_CSS.get(self.lang,
703 config.LANGUAGE_CSS['en'])
704 css_default = css_modes.get(mode, css_modes[None])
705 url = 'file://' + os.path.abspath(url2path(css_default))
706 elif not re.match(r'^http://\S+$', css):
707 fn = self.save_tempfile('objavi.css', css)
708 url = 'file://' + fn
709 else:
710 url = css
712 #find the head -- it's probably first child but lets not assume.
713 for child in htmltree:
714 if child.tag == 'head':
715 head = child
716 break
717 else:
718 head = htmltree.makeelement('head')
719 htmltree.insert(0, head)
721 link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
722 self.css_url = url
723 self.notify_watcher()
724 return url
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the appropriate language, otherwise in english.

        template -- a filename pattern with one %s slot for the language code.
        NOTE(review): if no candidate file opens, the read below raises
        NameError (f never bound) -- confirm this is acceptable here.
        """
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template
741 def compose_inside_cover(self):
742 """create the markup for the preamble inside cover."""
743 template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
745 if self.isbn:
746 isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
747 else:
748 isbn_text = ''
750 return template % {'date': time.strftime('%Y-%m-%d'),
751 'isbn': isbn_text,
752 'license': self.license,
756 def compose_end_matter(self):
757 """create the markup for the end_matter inside cover. If
758 self.isbn is not set, the html will result in a pdf that
759 spills onto two pages.
761 template = self._read_localised_template(config.END_MATTER_TEMPLATE)
763 d = {'css_url': self.css_url,
764 'title': self.title
767 if self.isbn:
768 d['inside_cover_style'] = ''
769 else:
770 d['inside_cover_style'] = 'page-break-after: always'
772 return template % d
775 def make_epub(self, use_cache=False):
776 """Make an epub version of the book, using Mike McCabe's
777 epub module for the Internet Archive."""
778 ebook = ia_epub.Book(self.publish_file, content_dir='')
779 def add_file(ID, filename, mediatype, content):
780 ebook.add_content({'media-type': mediatype.encode('utf-8'),
781 'id': ID.encode('utf-8'),
782 'href': filename.encode('utf-8'),
783 }, content)
785 toc = self.info['TOC']
787 #manifest
788 filemap = {} #map html to corresponding xhtml
789 spinemap = {} #map IDs to multi-file chapters
790 for ID in self.manifest:
791 details = self.manifest[ID]
792 #log(ID, pformat(details))
793 fn, mediatype = details['url'], details['mimetype']
794 content = self.store.read(fn)
795 if mediatype == 'text/html':
796 #convert to application/xhtml+xml, and perhaps split
797 c = EpubChapter(self.server, self.book, ID, content,
798 use_cache=use_cache)
799 c.remove_bad_tags()
800 if fn[-5:] == '.html':
801 fnbase = fn[:-5]
802 else:
803 fnbase = fn
804 fnx = fnbase + '.xhtml'
805 mediatype = 'application/xhtml+xml'
807 fragments = split_html(c.as_xhtml(),
808 compressed_size=self.store.getinfo(fn).compress_size)
810 #add the first one as if it is the whole thing (as it often is)
811 add_file(ID, fnx, mediatype, fragments[0])
812 filemap[fn] = fnx
813 if len(fragments) > 1:
814 spine_ids = [ID]
815 spinemap[ID] = spine_ids
816 #add any extras
817 for i in range(1, len(fragments)):
818 # XXX it is possible for duplicates if another
819 # file happens to have this name. Ignore for now
820 _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
821 spine_ids.append(_id)
822 add_file(_id,
823 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
824 mediatype, fragments[i])
826 else:
827 add_file(ID, fn, mediatype, content)
829 #toc
830 ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
831 ebook.add(ebook.content_dir + 'toc.ncx', ncx)
833 #spine
834 for ID in self.spine:
835 if ID in spinemap:
836 for x in spinemap[ID]:
837 ebook.add_spine_item({'idref': x})
838 else:
839 ebook.add_spine_item({'idref': ID})
841 #metadata -- no use of attributes (yet)
842 # and fm: metadata disappears for now
843 DCNS = config.DCNS
844 DC = config.DC
845 meta_info_items = []
846 for ns, namespace in self.metadata.items():
847 for keyword, schemes in namespace.items():
848 if ns:
849 keyword = '{%s}%s' % (ns, keyword)
850 for scheme, values in schemes.items():
851 for value in values:
852 item = {
853 'item': keyword,
854 'text': value,
856 if scheme:
857 if keyword in (DCNS + 'creator', DCNS + 'contributor'):
858 item['atts'] = {'role': scheme}
859 else:
860 item['atts'] = {'scheme': scheme}
862 has_authors = 'creator' in self.metadata[DC]
863 if not has_authors and config.CLAIM_UNAUTHORED:
864 authors = []
865 for x in self.metadata[DC]['creator'].values():
866 authors.extend(x)
868 meta_info_items.append({'item': DCNS + 'creator',
869 'text': 'The Contributors'})
871 meta_info_items.append({'item': DCNS + 'rights',
872 'text': 'This book is free. Copyright %s' % (', '.join(authors))}
875 tree_str = ia_epub.make_opf(meta_info_items,
876 ebook.manifest_items,
877 ebook.spine_items,
878 ebook.guide_items,
879 ebook.cover_id)
880 ebook.add(ebook.content_dir + 'content.opf', tree_str)
881 ebook.z.close()
882 self.notify_watcher()
885 def publish_s3(self):
886 """Push the book's epub to archive.org, using S3."""
887 #XXX why only epub?
888 secrets = {}
889 for x in ('S3_SECRET', 'S3_ACCESSKEY'):
890 fn = getattr(config, x)
891 f = open(fn)
892 secrets[x] = f.read().strip()
893 f.close()
895 now = time.strftime('%F')
896 s3output = self.filepath('s3-output.txt')
897 s3url, detailsurl = find_archive_urls(self.book, self.bookname)
898 headers = [
899 'x-amz-auto-make-bucket:1',
900 "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
901 'x-archive-meta-mediatype:texts',
902 'x-archive-meta-collection:opensource',
903 'x-archive-meta-title:%s' % (self.book,),
904 'x-archive-meta-date:%s' % (now,),
905 'x-archive-meta-creator:FLOSS Manuals Contributors',
908 if self.license in config.LICENSES:
909 headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
911 argv = ['curl', '--location', '-s', '-o', s3output]
912 for h in headers:
913 argv.extend(('--header', h))
914 argv.extend(('--upload-file', self.publish_file, s3url,))
916 log(' '.join(repr(x) for x in argv))
917 check_call(argv, stdout=sys.stderr)
918 self.notify_watcher()
919 return detailsurl, s3url
def spawn_x(self):
    """Start an Xvfb instance, using a new server number.  A
    reference to it is stored in self.xvfb, which is used to kill
    it when the pdf is done.

    Note that Xvfb doesn't interact well with dbus which is
    present on modern desktops.
    """
    #Find an unused server number (in case two cgis are running at once)
    #X puts a lock file in /tmp for each display in use.
    while True:
        servernum = random.randrange(50, 500)
        if not os.path.exists('/tmp/.X%s-lock' % servernum):
            break

    self.xserver_no = ':%s' % servernum

    #Keep the X authority data in this book's private workdir.
    authfile = self.filepath('Xauthority')
    os.environ['XAUTHORITY'] = authfile

    #mcookie(1) eats into /dev/random, so avoid that
    from hashlib import md5
    m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
    mcookie = m.hexdigest()

    check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

    self.xvfb = Popen(['Xvfb', self.xserver_no,
                       '-screen', '0', '1024x768x24',
                       '-pixdepths', '32',
                       #'-blackpixel', '0',
                       #'-whitepixel', str(2 ** 24 -1),
                       #'+extension', 'Composite',
                       '-dpi', '96',
                       #'-kb',
                       '-nolisten', 'tcp',
                       ])

    # We need to wait a bit before the Xvfb is ready.  but the
    # downloads are so slow that that probably doesn't matter
    #wait_for_xvfb() sleeps until this deadline before using the display.
    self.xvfb_ready_time = time.time() + 2

    os.environ['DISPLAY'] = self.xserver_no
    log(self.xserver_no)
def wait_for_xvfb(self):
    """Block until the deadline recorded by spawn_x has passed, so
    the Xvfb server has had time to properly start.  A no-op when no
    Xvfb was ever spawned."""
    if not hasattr(self, 'xvfb'):
        return
    remaining = self.xvfb_ready_time - time.time()
    if remaining > 0:
        time.sleep(remaining)
    self.notify_watcher()
def cleanup_x(self):
    """Try very hard to kill off Xvfb.  In addition to killing
    this instance's xvfb, occasionally (randomly) search for
    escaped Xvfb instances and kill those too."""
    if not hasattr(self, 'xvfb'):
        #spawn_x was never called, so there is nothing to clean up.
        return
    check_call(['xauth', 'remove', self.xserver_no])
    p = self.xvfb
    log("trying to kill Xvfb %s" % p.pid)
    #SIGTERM first, giving it a chance to exit cleanly.
    os.kill(p.pid, 15)
    for i in range(10):
        if p.poll() is not None:
            log("%s died with %s" % (p.pid, p.poll()))
            break
        log("%s not dead yet" % p.pid)
        time.sleep(0.2)
    else:
        #the loop ran out without breaking: ~2 seconds of SIGTERM
        #politeness is up, so resort to SIGKILL.
        log("Xvfb would not die! kill -9! kill -9!")
        try:
            os.kill(p.pid, 9)
        except OSError, e:
            #probably it died between the last poll and this kill.
            log(e)

    if random.random() < 0.1:
        # occasionally kill old xvfbs and soffices, if there are any.
        self.kill_old_processes()
1003 def kill_old_processes(self):
1004 """Sometimes, despite everything, Xvfb or soffice instances
1005 hang around well after they are wanted -- for example if the
1006 cgi process dies particularly badly. So kill them if they have
1007 been running for a long time."""
1008 log("running kill_old_processes")
1009 killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1010 os.path.basename(config.HTML2ODT),
1011 os.path.basename(config.WKHTMLTOPDF),
1013 p = Popen(['ps', '-C', killable_names,
1014 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1015 data = p.communicate()[0].strip()
1016 if data:
1017 lines = data.split('\n')
1018 pids = []
1019 for line in lines:
1020 log('dealing with ps output "%s"' % line)
1021 try:
1022 pid, days, hours, minutes, seconds \
1023 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1024 except AttributeError:
1025 log("Couldn't parse that line!")
1026 # 50 minutes should be enough xvfb time for anyone
1027 if days or hours or int(minutes) > 50:
1028 pid = int(pid)
1029 log("going to kill pid %s" % pid)
1030 os.kill(pid, 15)
1031 pids.append(pid)
1033 time.sleep(1.0)
1034 for pid in pids:
1035 #try again in case any are lingerers
1036 try:
1037 os.kill(int(pid), 9)
1038 except OSError, e:
1039 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1040 continue
1041 log('killing %s with -9' % pid)
1042 self.notify_watcher()
def cleanup(self):
    """Kill the X server and, unless configured to keep temporary
    files, empty and remove this book's working directory."""
    self.cleanup_x()
    if config.KEEP_TEMP_FILES:
        log("NOT removing '%s', containing the following files:" % self.workdir)
        log(*os.listdir(self.workdir))
    else:
        #the workdir is flat, so removing each file then the
        #directory itself is sufficient.
        for name in os.listdir(self.workdir):
            os.remove(os.path.join(self.workdir, name))
        os.rmdir(self.workdir)

    self.notify_watcher()
def use_cache():
    """Return True when this host is configured to always use
    cached booki-zips."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
1060 def _read_cached_zip(server, book, max_age):
1061 #find a recent zip if possible
1062 prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1063 from glob import glob
1064 zips = sorted(glob(prefix + '*.zip'))
1065 if not zips:
1066 log("no cached booki-zips matching %s*.zip" % (prefix,))
1067 return None
1068 zipname = zips[-1]
1069 cutoff = time.time() - max_age * 60
1070 log(repr(zipname))
1071 try:
1072 date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1073 if date > cutoff:
1074 f = open(zipname)
1075 blob = f.read()
1076 f.close()
1077 return blob, zipname
1078 log("%s is too old, must reload" % zipname)
1079 return None
1080 except (IOError, IndexError, ValueError), e:
1081 log('could not make sense of %s: got exception %s' % (zipname, e))
1082 return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for a book, preferring a fresh-enough
    local cache copy when caching applies.

    Returns (blob, filename): blob is the raw zip data; filename is
    the cache file it came from or was saved to (None when not
    saving and no filename was given).

    max_age is in minutes; 0 disables the cache, negative means
    "use the default policy".  Raises NotImplementedError for an
    unknown server interface."""
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        #NOTE(review): HTTP_HOST is assumed to be a module-level
        #global defined elsewhere in this file -- confirm.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book': book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s' % url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        #BUG FIX: write in binary mode -- the blob is zip data, and
        #text mode would corrupt it on platforms that translate
        #line endings.
        f = open(filename, 'wb')
        f.write(blob)
        f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    html is a byte string.  compressed_size, when given, saves
    recompressing the html to estimate its deflated size.  When
    fix_markup is true, the html is first round-tripped through
    lxml, which makes the naive '<' search below more reliable.

    Returns a list of html strings (one element when no split is
    needed)."""
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    #number of extra pieces needed, by whichever limit bites harder.
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    #insert a marker <hr> at the first tag boundary at or after each
    #evenly spaced target offset; split_tree presumably divides the
    #document at these markers -- confirm against xhtml_utils.
    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]