More unicode mangling
[objavi2.git] / objavi / fmbook.py
blob7a976ec3bd46ca6daa2e453258fe23b777001d71
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 from subprocess import Popen, check_call, PIPE
28 from cStringIO import StringIO
29 from urllib2 import urlopen, HTTPError
30 import zipfile
31 import traceback
32 from string import ascii_letters
33 from pprint import pformat
35 try:
36 import simplejson as json
37 except ImportError:
38 import json
40 import lxml.html
41 from lxml import etree
43 from objavi import config, epub_utils
44 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
45 from objavi.book_utils import ObjaviError, log_types
46 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
47 from objavi.epub import add_guts, _find_tag
48 from objavi.xhtml_utils import EpubChapter, split_tree
49 from objavi.cgi_utils import url2path
51 from iarchive import epub as ia_epub
52 from booki.bookizip import get_metadata, add_metadata
# Working area for temporary per-book build directories (absolute path).
TMPDIR = os.path.abspath(config.TMPDIR)
# Web document root and host, taken from the CGI environment when present.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the (s3 upload url, details page url) pair for a book
    hosted on archive.org under the 'booki-<bookid>' bucket."""
    bucket = 'booki-%s' % bookid
    s3url = 'http://s3.us.archive.org/%s/%s' % (bucket, bookname)
    detailsurl = 'http://archive.org/details/%s' % bucket
    return (s3url, detailsurl)
63 def _get_best_title(tocpoint):
64 if 'html_title' in tocpoint:
65 return tocpoint['html_title']
66 if 'title' in tocpoint:
67 return tocpoint['title']
68 return 'Untitled'
71 def _add_initial_number(e, n):
72 """Put a styled chapter number n at the beginning of element e."""
73 initial = e.makeelement("strong", Class="initial")
74 e.insert(0, initial)
75 initial.tail = ' '
76 if e.text is not None:
77 initial.tail += e.text
78 e.text = ''
79 initial.text = "%s." % n
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience: annotate every point
    (recursively) with its depth, filename, fragment and a running
    serial index.  Returns the next unused index."""
    for point in toc:
        url = point['url'].lstrip('/')
        filename, sep, fragment = url.partition('#')
        point['depth'] = depth
        point["filename"] = filename
        point["fragment"] = fragment if sep else None
        point["index"] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
97 def _serialise(rtoc, stoc, depth):
98 for item in rtoc:
99 url = item['url'].lstrip('/')
100 bits = url.split('#', 1)
101 filename = bits[0]
102 fragment = (bits[1] if len(bits) == 2 else None)
103 stoc.append({"depth": depth,
104 "title": item['title'],
105 "url": url,
106 "filename": filename,
107 "fragment": fragment,
108 "type": item['type']
110 if 'children' in item:
111 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points. Reformat some things for convenience."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Map each filename to the list of TOC points that live in that
    file, in document (preorder) sequence."""
    tocmap = {}
    pending = list(rtoc)
    while pending:
        point = pending.pop(0)
        tocmap.setdefault(point['filename'], []).append(point)
        if 'children' in point:
            # Visit children before later siblings (preorder).
            pending[0:0] = point['children']
    return tocmap
def save_data(fn, data):
    """Save without tripping up on unicode: unicode text is encoded
    as UTF-8 (undecodable characters dropped) before writing."""
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    with open(fn, 'w') as out:
        out.write(data)
class Book(object):
    """One book being rendered from a bookizip into PDF/ODT/epub output."""
    # Numbering style for the main body pages.
    page_numbers = 'latin'
    # Numbering style for the front-matter (preamble) pages.
    preamble_page_numbers = 'roman'
def notify_watcher(self, message=None):
    """Report progress to every registered watcher callback.

    When no message is given, the name of the calling function is
    used as the message."""
    if not self.watchers:
        return
    if message is None:
        # default to the caller's function name
        message = traceback.extract_stack(None, 2)[0][2]
    log("notify_watcher called with '%s'" % message)
    for watcher in self.watchers:
        watcher(message)
def __enter__(self):
    """Context-manager entry: the Book itself is the managed resource."""
    return self
def __exit__(self, exc_type, exc_value, tb):
    """Context-manager exit: announce completion, then clean up the
    working directory.  Exceptions are not suppressed."""
    self.notify_watcher(config.FINISHED_MESSAGE)
    self.cleanup()
    #could deal with exceptions here and return true
166 def __init__(self, book, server, bookname,
167 page_settings=None, watchers=None, isbn=None,
168 license=config.DEFAULT_LICENSE, title=None,
169 max_age=0):
170 log("*** Starting new book %s ***" % bookname)
171 self.watchers = set()
172 if watchers is not None:
173 self.watchers.update(watchers)
174 self.notify_watcher('start')
175 self.bookname = bookname
176 self.book = book
177 self.server = server
178 self.cookie = ''.join(random.sample(ascii_letters, 10))
179 try:
180 blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
181 except HTTPError, e:
182 traceback.print_exc()
183 self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
184 #not much to do?
185 #raise 502 Bad Gateway ?
186 sys.exit()
187 f = StringIO(blob)
188 self.notify_watcher('fetch_zip')
189 self.store = zipfile.ZipFile(f, 'r')
190 self.info = json.loads(self.store.read('info.json'))
191 for k in ('manifest', 'metadata', 'spine', 'TOC'):
192 if k not in self.info:
193 raise ObjaviError('info.json of %s lacks vital element "%s"' %
194 (bookname, k))
195 #check types also?
197 self.metadata = self.info['metadata']
198 self.spine = self.info['spine']
199 self.manifest = self.info['manifest']
201 if server == config.LOCALHOST: # [DEPRECATED]
202 server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
203 book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
205 log(pformat(self.metadata))
206 self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
207 if not self.lang:
208 self.lang = guess_lang(server, book)
209 log('guessed lang as %s' % self.lang)
211 self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
212 if not self.toc_header:
213 self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
215 self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
216 if not self.dir:
217 self.dir = guess_text_dir(server, book)
219 #Patch in the extra metadata. (lang and dir may be set from config)
220 #these should be read from zip -- so should go into zip?
221 for var, key, scheme, ns in (
222 (isbn, 'id', 'ISBN', config.DC),
223 (license, 'rights', 'License', config.DC),
224 (title, 'title', '', config.DC),
225 (self.lang, 'language', '', config.DC),
226 (self.dir, 'dir', '', config.FM),
228 if var is not None:
229 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
231 self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
232 self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
234 self.toc = self.info['TOC']
235 expand_toc(self.toc)
237 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
238 os.chmod(self.workdir, 0755)
240 self.body_html_file = self.filepath('body.html')
241 self.body_pdf_file = self.filepath('body.pdf')
242 self.preamble_html_file = self.filepath('preamble.html')
243 self.preamble_pdf_file = self.filepath('preamble.pdf')
244 self.tail_html_file = self.filepath('tail.html')
245 self.tail_pdf_file = self.filepath('tail.pdf')
246 self.isbn_pdf_file = None
247 self.pdf_file = self.filepath('final.pdf')
248 self.body_odt_file = self.filepath('body.odt')
249 self.outline_file = self.filepath('outline.txt')
251 self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
253 if page_settings is not None:
254 self.maker = PageSettings(**page_settings)
256 if title is not None:
257 self.title = title
258 else:
259 titles = get_metadata(self.metadata, 'title')
260 if titles:
261 self.title = titles[0]
262 else:
263 self.title = 'A Book About ' + self.book
264 if isinstance(self.title, unicode):
265 self.title = self.title.encode('utf-8')
267 self.notify_watcher()
if config.TRY_BOOK_CLEANUP_ON_DEL:
    #Dont even define __del__ if it is not used.
    _try_cleanup_on_del = True
    def __del__(self):
        """Last-ditch removal of the workdir when the Book is collected."""
        if self._try_cleanup_on_del and os.path.exists(self.workdir):
            self._try_cleanup_on_del = False #or else you can get in bad cycles
            self.cleanup()
def get_tree_by_id(self, id):
    """get an HTML tree from the given manifest ID

    HTML is parsed leniently (an unparseable file yields an empty
    document); other XML is parsed with etree; anything else is
    returned as the raw string."""
    name = self.manifest[id]['url']
    mimetype = self.manifest[id]['mimetype']
    s = self.store.read(name)
    f = StringIO(s)
    if mimetype == 'text/html':
        try:
            tree = lxml.html.parse(f)
        except etree.XMLSyntaxError, e:
            log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                (id, name, s[:20], e))
            # Fall back to an empty document so processing can continue.
            tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
    elif 'xml' in mimetype: #XXX or is this just asking for trouble?
        tree = etree.parse(f)
    else:
        # Not markup: hand back the raw contents.
        tree = f.read()
    f.close()
    return tree
def filepath(self, fn):
    """Return fn as a path inside this book's working directory."""
    return os.path.join(self.workdir, fn)
def save_tempfile(self, fn, data):
    """Save the data in a temporary directory that will be cleaned
    up when all is done. Return the absolute file path."""
    path = self.filepath(fn)
    save_data(path, data)
    return path
def make_oo_doc(self):
    """Make an openoffice document, using the html2odt script."""
    # html2odt drives OpenOffice, which needs the X display from spawn_x.
    self.wait_for_xvfb()
    html_text = etree.tostring(self.tree, method="html")
    save_data(self.body_html_file, html_text)
    run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
    log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
    os.rename(self.body_odt_file, self.publish_file)
    self.notify_watcher()
def extract_pdf_outline(self):
    """Get the outline (table of contents) for the PDF, which
    wkhtmltopdf should have written to a file. If that file
    doesn't exist (or config says not to use it), fall back to
    using self._extract_pdf_outline_the_old_way, below.

    Returns the number of pages in the body PDF.
    """
    if config.USE_DUMP_OUTLINE:
        try:
            self.outline_contents, number_of_pages = \
                parse_extracted_outline(self.outline_file)

        except Exception, e:
            # Any failure reading the dumped outline: log it and fall back.
            traceback.print_exc()
            number_of_pages = self._extract_pdf_outline_the_old_way()
    else:
        number_of_pages = self._extract_pdf_outline_the_old_way()

    self.notify_watcher()
    return number_of_pages
def _extract_pdf_outline_the_old_way(self):
    """Try to get the PDF outline using pdftk. This doesn't work
    well with all scripts.

    If no outline is found (typical with non-Latin text), rebuild a
    copy of the book whose headings are replaced with ASCII keys,
    extract the outline from that, and map the keys back to the real
    titles.  Returns the number of pages in the body PDF."""
    debugf = self.filepath('extracted-outline.txt')
    self.outline_contents, number_of_pages = \
        parse_outline(self.body_pdf_file, 1, debugf)

    if not self.outline_contents:
        #probably problems with international text. need a horrible hack
        log('no outline: trying again with ascii headings')
        import copy
        tree = copy.deepcopy(self.tree)
        titlemap = {}
        for tag in ('h1', 'h2', 'h3', 'h4'):
            for i, e in enumerate(tree.getiterator(tag)):
                # Remember the real title under a synthetic ASCII key,
                # then replace the heading's content with that key.
                key = "%s_%s" % (tag, i)
                titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                del e[:]
                if tag == 'h1':
                    e = lxml.etree.SubElement(e, "strong", Class="initial")
                e.text = key
                log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

        ascii_html_file = self.filepath('body-ascii-headings.html')
        ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
        html_text = lxml.etree.tostring(tree, method="html")
        save_data(ascii_html_file, html_text)
        self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
        debugf = self.filepath('ascii-extracted-outline.txt')
        ascii_contents, number_of_ascii_pages = \
            parse_outline(ascii_pdf_file, 1, debugf)
        self.outline_contents = []
        log ("number of pages: %s, post ascii: %s" %
             (number_of_pages, number_of_ascii_pages))
        for ascii_title, depth, pageno in ascii_contents:
            if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                ascii_title = ascii_title[:-4]
            if ' ' in ascii_title:
                # Keep only the synthetic key (last space-separated word).
                ascii_title = ascii_title.rsplit(' ', 1)[1]
            title = titlemap.get(ascii_title, '')
            log((ascii_title, title, depth, pageno))

            self.outline_contents.append((title, depth, pageno))

    return number_of_pages
def make_body_pdf(self):
    """Make a pdf of the HTML, using webkit"""
    #1. Save the html
    html_text = etree.tostring(self.tree, method="html")
    save_data(self.body_html_file, html_text)

    #2. Make a pdf of it
    self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
    self.notify_watcher('generate_pdf')

    # 3. Read back the outline to learn page numbers and TOC entries.
    n_pages = self.extract_pdf_outline()

    log ("found %s pages in pdf" % n_pages)
    #4. resize pages, shift gutters, even pages
    self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
    self.notify_watcher('reshape_pdf')

    #5 add page numbers
    self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                          numbers=self.page_numbers)
    self.notify_watcher("number_pdf")
    self.notify_watcher()
def make_preamble_pdf(self):
    """Build the front-matter PDF: inside cover page plus the table
    of contents, numbered in the preamble (roman) style."""
    contents = self.make_contents()
    inside_cover_html = self.compose_inside_cover()
    log_types(self.dir, self.css_url, self.title, inside_cover_html,
              self.toc_header, contents, self.title)

    html = ('<html dir="%s"><head>\n'
            '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
            '<link rel="stylesheet" href="%s" />\n'
            '</head>\n<body>\n'
            '<h1 class="frontpage">%s</h1>'
            '%s\n'
            '<div class="contents"><h1>%s</h1>\n%s</div>\n'
            '<div style="page-break-after: always; color:#fff" class="unseen">.'
            '<!--%s--></div></body></html>'
            ) % (self.dir, self.css_url, self.title, inside_cover_html,
                 self.toc_header, contents, self.title)
    save_data(self.preamble_html_file, html)

    self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

    self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

    # number_start=-2: offsets the numbering for the cover pages.
    self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                          numbers=self.preamble_page_numbers,
                          number_start=-2)

    self.notify_watcher()
def make_end_matter_pdf(self):
    """Make an inside back cover and a back cover. If there is an
    isbn number its barcode will be put on the back cover."""
    if self.isbn:
        self.isbn_pdf_file = self.filepath('isbn.pdf')
        self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
        self.notify_watcher('make_barcode_pdf')

    end_matter = self.compose_end_matter()
    #log(end_matter)
    save_data(self.tail_html_file, end_matter.decode('utf-8'))
    self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

    # even_pages=False: the tail may legitimately end on an odd page.
    self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                           centre_end=True, even_pages=False)
    self.notify_watcher()
def make_book_pdf(self):
    """A convenient wrapper of a few necessary steps"""
    # now the Xvfb server is needed. make sure it has had long enough to get going
    self.wait_for_xvfb()
    self.make_body_pdf()
    self.make_preamble_pdf()
    self.make_end_matter_pdf()

    # Assemble the final book: preamble, body, end matter, optional ISBN page.
    concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                self.body_pdf_file, self.tail_pdf_file,
                self.isbn_pdf_file)

    self.notify_watcher('concatenated_pdfs')
def make_simple_pdf(self, mode):
    """Make a simple pdf document without contents or separate
    title page. This is used for multicolumn newspapers and for
    web-destined pdfs."""
    self.wait_for_xvfb()
    #0. Add heading to begining of html
    body = list(self.tree.cssselect('body'))[0]
    e = body.makeelement('h1', {'id': 'book-title'})
    e.text = self.title
    body.insert(0, e)
    intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
    e.addnext(intro)

    #0.5 adjust parameters to suit the particular kind of output
    if mode == 'web':
        self.maker.gutter = 0

    #1. Save the html
    html_text = etree.tostring(self.tree, method="html")
    save_data(self.body_html_file, html_text)

    #2. Make a pdf of it (direct to to final pdf)
    self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
    self.notify_watcher('generate_pdf')
    n_pages = count_pdf_pages(self.pdf_file)

    if mode != 'web':
        #3. resize pages and shift gutters.
        self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #4. add page numbers
        self.maker.number_pdf(self.pdf_file, n_pages,
                              dir=self.dir, numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
    self.notify_watcher()
def rotate180(self):
    """Rotate the pdf 180 degrees so an RTL book can print on LTR
    presses."""
    rotated_copy = self.filepath('final-rotate.pdf')
    backup = self.filepath('final-pre-rotate.pdf')
    # Write the rotated copy first so the original pdf survives any
    # error, then swap the files into place.
    rotate_pdf(self.pdf_file, rotated_copy)
    os.rename(self.pdf_file, backup)
    os.rename(rotated_copy, self.pdf_file)
    self.notify_watcher()
def publish_pdf(self):
    """Move the finished PDF to its final resting place"""
    log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
    os.rename(self.pdf_file, self.publish_file)
    self.notify_watcher()
def publish_bookizip(self):
    """Publish the bookizip. For this, copy rather than move,
    because the bookizip might be used by further processing. If
    possible, a hard link is created."""
    log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
    try:
        # 'cp -l' hard-links; fails (OSError from run) across filesystems.
        run(['cp', '-l', self.bookizip_file, self.publish_file])
    except OSError:
        run(['cp', self.bookizip_file, self.publish_file])
    self.notify_watcher()
def concat_html(self):
    """Join all the chapters together into one tree.  Keep the TOC
    up-to-date along the way.

    Every TOC point ends up with an 'html_id' that can be used to
    locate it in the combined tree; chapters that start with a
    heading also get an 'html_title'."""
    #each manifest item looks like:
    #{'contributors': []
    #'license': [],
    #'mimetype': '',
    #'rightsholders': []
    #'url': ''}
    doc = lxml.html.document_fromstring('<html><body></body></html>')
    tocmap = filename_toc_map(self.toc)
    for ID in self.spine:
        details = self.manifest[ID]
        #log(ID, pformat(details))
        # ACO MIJENJAO
        try:
            root = self.get_tree_by_id(ID).getroot()
        except Exception, e:
            log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
            continue
        #handle any TOC points in this file
        for point in tocmap[details['url']]:
            #if the url has a #identifier, use it. Otherwise, make
            #one up, using a hidden element at the beginning of
            #the inserted document.
            #XXX this will break if different files use the same ids
            #XXX should either replace all, or replace selectively.
            if point['fragment']:
                fragment = point['fragment']
            else:
                body = _find_tag(root, 'body')
                fragment = '%s_%s' % (self.cookie, point['index'])
                #reuse first tag if it is suitable.
                if (len(body) and
                    body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                    if body[0].get('id') is None:
                        body[0].set('id', fragment)
                    else:
                        fragment = body[0].get('id')
                    #the chapter starts with a heading. that heading should be the chapter name.
                    if body[0].tag in ('h1', 'h2', 'h3'):
                        #log('chapter has title "%s", found html title "%s"' %
                        #    (point['title'], body[0].text_content()))
                        point['html_title'] = body[0].text_content()
                else:
                    # No suitable first tag: insert an invisible marker div.
                    marker = body.makeelement('div', style="display:none",
                                              id=fragment)
                    body.insert(0, marker)
            point['html_id'] = fragment

        add_guts(root, doc)
    return doc
def unpack_static(self):
    """Extract static files from the zip for the html to refer to."""
    static_files = [x['url'] for x in self.manifest.values()
                    if x['url'].startswith('static')]
    if static_files:
        os.mkdir(self.filepath('static'))

    for name in static_files:
        s = self.store.read(name)
        f = open(self.filepath(name), 'w')
        f.write(s)
        f.close()
    self.notify_watcher()
def load_book(self):
    """Unpack static files and build self.tree, the single
    concatenated HTML document for the whole book."""
    #XXX concatenate the HTML to match how TWiki version worked.
    # This is perhaps foolishly early -- throwing away useful boundaries.
    self.unpack_static()
    self.tree = self.concat_html()
    self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

    self.headings = [x for x in self.tree.cssselect('h1')]
    if self.headings:
        self.headings[0].set('class', "first-heading")
    for h1 in self.headings:
        h1.title = h1.text_content().strip()
    self.notify_watcher()
def make_contents(self):
    """Generate HTML containing the table of contents. This can
    only be done after the main PDF has been made, because the
    page numbers are contained in the PDF outline."""
    header = '<table class="toc">\n'
    row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                '<td class="pagenumber">%s</td></tr>\n')
    empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
    section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
    footer = '\n</table>'

    contents = []

    chapter = 1
    page_num = 1
    #log(self.outline_contents)
    outline_contents = iter(self.outline_contents)

    for section in self.toc:
        if not section.get('children'):
            contents.append(empty_section_tmpl % section['title'])
            continue
        contents.append(section_tmpl % section['title'])

        for point in section['children']:
            try:
                # Advance through the outline until the next top-level
                # (depth 1) entry; its page number is the chapter's.
                level = 99
                while level > 1:
                    h1_text, level, page_num = outline_contents.next()
            except StopIteration:
                log("contents data not found for %s. Stopping" % (point,))
                break
            contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
            chapter += 1

    doc = header + '\n'.join(contents) + footer
    if isinstance(doc, unicode):
        doc = doc.encode('utf-8')
    self.notify_watcher()
    return doc
def add_section_titles(self):
    """Add any section heading pages that the TOC.txt file
    specifies. These are sub-book, super-chapter groupings.

    Also add initial numbers to chapters.
    """
    chapter = 1
    section = None
    #log(self.toc)
    for t in self.toc:
        #only top level sections get a subsection page,
        #and only if they have children.
        if t.get('children'):
            section = self.tree.makeelement('div', Class="objavi-subsection")
            heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
            heading.text = t['title']
            for child in t['children']:
                item = etree.SubElement(section, 'div', Class="objavi-chapter")
                if 'html_title' in child:
                    # Chapter already has an in-document heading; number it there.
                    item.text = child['html_title']
                    heading = self.tree.cssselect('#'+ child['html_id'])
                    if heading:
                        _add_initial_number(heading[0], chapter)
                else:
                    item.text = child['title']
                    _add_initial_number(item, chapter)
                log(item.text, debug='HTMLGEN')
                chapter += 1
            log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
            # Insert the section page just before the section's first chapter.
            location = self.tree.cssselect('#'+ t['html_id'])[0]
            location.addprevious(section)

    self.notify_watcher()
def add_css(self, css=None, mode='book'):
    """If css looks like a url, use it as a stylesheet link.
    Otherwise it is the CSS itself, which is saved to a temporary file
    and linked to.

    Returns the resulting stylesheet URL (also kept as self.css_url)."""
    log("css is %r" % css)
    htmltree = self.tree
    if css is None or not css.strip():
        # No CSS given: use the server default, or guess from language.
        css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
        if css_default is None:
            #guess from language -- this should come first
            css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                config.LANGUAGE_CSS['en'])
            css_default = css_modes.get(mode, css_modes[None])
        url = 'file://' + os.path.abspath(url2path(css_default))
    elif not re.match(r'^http://\S+$', css):
        # Raw CSS text: save it and link to the saved copy.
        fn = self.save_tempfile('objavi.css', css)
        url = 'file://' + fn
    else:
        url = css

    #find the head -- it's probably first child but lets not assume.
    for child in htmltree:
        if child.tag == 'head':
            head = child
            break
    else:
        head = htmltree.makeelement('head')
        htmltree.insert(0, head)

    link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
    self.css_url = url
    self.notify_watcher()
    return url
def _read_localised_template(self, template, fallbacks=['en']):
    """Try to get the template in the appropriate language, otherwise in english.

    `template` is a filename pattern with one %s slot for the language
    code.  NOTE(review): mutable default argument (never mutated here);
    and if no candidate file opens, `f` is unbound below -> NameError."""
    for lang in [self.lang] + fallbacks:
        try:
            fn = template % (lang)
            f = open(fn)
            break
        except IOError, e:
            log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
            log(e)
    template = f.read()
    f.close()
    return template
def compose_inside_cover(self):
    """create the markup for the preamble inside cover.

    Fills the localised template with the date, an ISBN line (empty
    when the book has no ISBN) and the licence name.

    Fix: the substitution dict was truncated in this copy of the file
    (missing its closing '}'); restored here.
    """
    template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

    if self.isbn:
        isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
    else:
        isbn_text = ''

    return template % {'date': time.strftime('%Y-%m-%d'),
                       'isbn': isbn_text,
                       'license': self.license,
                       }
def compose_end_matter(self):
    """create the markup for the end_matter inside cover. If
    self.isbn is not set, the html will result in a pdf that
    spills onto two pages.

    Fix: the docstring terminator and the closing '}' of the
    substitution dict were truncated in this copy of the file;
    restored here.
    """
    template = self._read_localised_template(config.END_MATTER_TEMPLATE)

    d = {'css_url': self.css_url,
         'title': self.title
         }

    if self.isbn:
        d['inside_cover_style'] = ''
    else:
        # Force a page break so the end matter fills two pages.
        d['inside_cover_style'] = 'page-break-after: always'

    return template % d
def make_epub(self, use_cache=False):
    """Make an epub version of the book, using Mike McCabe's
    epub module for the Internet Archive.

    Fixes for truncation in this copy of the file: the metadata item
    dict's closing '}', the `meta_info_items.append(item)` line, and
    the final ')' of the rights append were lost; restored here.
    """
    ebook = ia_epub.Book(self.publish_file, content_dir='')
    def add_file(ID, filename, mediatype, content):
        # Register one file in the epub manifest and write its content.
        ebook.add_content({'media-type': mediatype.encode('utf-8'),
                           'id': ID.encode('utf-8'),
                           'href': filename.encode('utf-8'),
                           }, content)
    toc = self.info['TOC']

    #manifest
    filemap = {} #map html to corresponding xhtml
    spinemap = {} #map IDs to multi-file chapters
    for ID in self.manifest:
        details = self.manifest[ID]
        #log(ID, pformat(details))
        fn, mediatype = details['url'], details['mimetype']
        content = self.store.read(fn)
        if mediatype == 'text/html':
            #convert to application/xhtml+xml, and perhaps split
            c = EpubChapter(self.server, self.book, ID, content,
                            use_cache=use_cache)
            c.remove_bad_tags()
            if fn[-5:] == '.html':
                fnbase = fn[:-5]
            else:
                fnbase = fn
            fnx = fnbase + '.xhtml'
            mediatype = 'application/xhtml+xml'

            # NOTE(review): this module imports split_tree, not
            # split_html -- confirm where split_html is defined.
            fragments = split_html(c.as_xhtml(),
                                   compressed_size=self.store.getinfo(fn).compress_size)

            #add the first one as if it is the whole thing (as it often is)
            add_file(ID, fnx, mediatype, fragments[0])
            filemap[fn] = fnx
            if len(fragments) > 1:
                spine_ids = [ID]
                spinemap[ID] = spine_ids
                #add any extras
                for i in range(1, len(fragments)):
                    # XXX it is possible for duplicates if another
                    # file happens to have this name. Ignore for now
                    _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                    spine_ids.append(_id)
                    add_file(_id,
                             '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                             mediatype, fragments[i])
        else:
            add_file(ID, fn, mediatype, content)

    #toc
    ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
    ebook.add(ebook.content_dir + 'toc.ncx', ncx)

    #spine
    for ID in self.spine:
        if ID in spinemap:
            for x in spinemap[ID]:
                ebook.add_spine_item({'idref': x})
        else:
            ebook.add_spine_item({'idref': ID})

    #metadata -- no use of attributes (yet)
    # and fm: metadata disappears for now
    DCNS = config.DCNS
    DC = config.DC
    meta_info_items = []
    for ns, namespace in self.metadata.items():
        for keyword, schemes in namespace.items():
            if ns:
                keyword = '{%s}%s' % (ns, keyword)
            for scheme, values in schemes.items():
                for value in values:
                    item = {
                        'item': keyword,
                        'text': value,
                        }
                    if scheme:
                        if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                            item['atts'] = {'role': scheme}
                        else:
                            item['atts'] = {'scheme': scheme}
                    # Restored: collect the item (line lost in extraction).
                    meta_info_items.append(item)

    # NOTE(review): when has_authors is False the lookup below raises
    # KeyError on 'creator' -- confirm intended behaviour upstream.
    has_authors = 'creator' in self.metadata[DC]
    if not has_authors and config.CLAIM_UNAUTHORED:
        authors = []
        for x in self.metadata[DC]['creator'].values():
            authors.extend(x)

        meta_info_items.append({'item': DCNS + 'creator',
                                'text': 'The Contributors'})

        meta_info_items.append({'item': DCNS + 'rights',
                                'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                               )

    tree_str = ia_epub.make_opf(meta_info_items,
                                ebook.manifest_items,
                                ebook.spine_items,
                                ebook.guide_items,
                                ebook.cover_id)
    ebook.add(ebook.content_dir + 'content.opf', tree_str)
    ebook.z.close()
    self.notify_watcher()
def publish_s3(self):
    """Push the book's epub to archive.org, using S3.

    Returns (details page url, s3 url).  Fix: the headers list was
    truncated in this copy of the file (missing its closing ']');
    restored here.
    """
    #XXX why only epub?
    # Read the S3 credentials from the files named in config.
    secrets = {}
    for x in ('S3_SECRET', 'S3_ACCESSKEY'):
        fn = getattr(config, x)
        f = open(fn)
        secrets[x] = f.read().strip()
        f.close()

    now = time.strftime('%F')
    s3output = self.filepath('s3-output.txt')
    s3url, detailsurl = find_archive_urls(self.book, self.bookname)
    headers = [
        'x-amz-auto-make-bucket:1',
        "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
        'x-archive-meta-mediatype:texts',
        'x-archive-meta-collection:opensource',
        'x-archive-meta-title:%s' % (self.book,),
        'x-archive-meta-date:%s' % (now,),
        'x-archive-meta-creator:FLOSS Manuals Contributors',
        ]

    if self.license in config.LICENSES:
        headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

    # Upload via curl; its output goes to s3output for debugging.
    argv = ['curl', '--location', '-s', '-o', s3output]
    for h in headers:
        argv.extend(('--header', h))
    argv.extend(('--upload-file', self.publish_file, s3url,))

    log(' '.join(repr(x) for x in argv))
    check_call(argv, stdout=sys.stderr)
    self.notify_watcher()
    return detailsurl, s3url
def spawn_x(self):
    """Start an Xvfb instance, using a new server number. A
    reference to it is stored in self.xvfb, which is used to kill
    it when the pdf is done.

    Note that Xvfb doesn't interact well with dbus which is
    present on modern desktops.

    Fix: the docstring terminator and the closing '])' of the Popen
    argument list were truncated in this copy of the file; restored.
    """
    #Find an unused server number (in case two cgis are running at once)
    while True:
        servernum = random.randrange(50, 500)
        if not os.path.exists('/tmp/.X%s-lock' % servernum):
            break

    self.xserver_no = ':%s' % servernum

    authfile = self.filepath('Xauthority')
    os.environ['XAUTHORITY'] = authfile

    #mcookie(1) eats into /dev/random, so avoid that
    from hashlib import md5
    m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
    mcookie = m.hexdigest()

    check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

    self.xvfb = Popen(['Xvfb', self.xserver_no,
                       '-screen', '0', '1024x768x24',
                       '-pixdepths', '32',
                       #'-blackpixel', '0',
                       #'-whitepixel', str(2 ** 24 -1),
                       #'+extension', 'Composite',
                       '-dpi', '96',
                       #'-kb',
                       '-nolisten', 'tcp',
                       ])

    # We need to wait a bit before the Xvfb is ready. but the
    # downloads are so slow that that probably doesn't matter
    self.xvfb_ready_time = time.time() + 2

    os.environ['DISPLAY'] = self.xserver_no
    log(self.xserver_no)
969 def wait_for_xvfb(self):
970 """wait until a previously set time before continuing. This
971 is so Xvfb has time to properly start."""
972 if hasattr(self, 'xvfb'):
973 d = self.xvfb_ready_time - time.time()
974 if d > 0:
975 time.sleep(d)
976 self.notify_watcher()
    def cleanup_x(self):
        """Try very hard to kill off Xvfb. In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        # Nothing to do if spawn_x never ran.
        if not hasattr(self, 'xvfb'):
            return
        # Drop the xauth cookie added for this display by spawn_x.
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        # Polite first: SIGTERM, then poll for up to ~2 seconds.
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            # for-else: only reached if the poll loop never broke,
            # i.e. the process survived SIGTERM -- escalate to SIGKILL.
            log("Xvfb would not die! kill -9! kill -9!")
            try:
                os.kill(p.pid, 9)
            except OSError, e:
                # e.g. it died between the last poll and the kill -9
                log(e)
        if random.random() < 0.1:
            # occasionally kill old xvfbs and soffices, if there are any.
            self.kill_old_processes()
1005 def kill_old_processes(self):
1006 """Sometimes, despite everything, Xvfb or soffice instances
1007 hang around well after they are wanted -- for example if the
1008 cgi process dies particularly badly. So kill them if they have
1009 been running for a long time."""
1010 log("running kill_old_processes")
1011 killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1012 os.path.basename(config.HTML2ODT),
1013 os.path.basename(config.WKHTMLTOPDF),
1015 p = Popen(['ps', '-C', killable_names,
1016 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1017 data = p.communicate()[0].strip()
1018 if data:
1019 lines = data.split('\n')
1020 pids = []
1021 for line in lines:
1022 log('dealing with ps output "%s"' % line)
1023 try:
1024 pid, days, hours, minutes, seconds \
1025 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1026 except AttributeError:
1027 log("Couldn't parse that line!")
1028 # 50 minutes should be enough xvfb time for anyone
1029 if days or hours or int(minutes) > 50:
1030 pid = int(pid)
1031 log("going to kill pid %s" % pid)
1032 os.kill(pid, 15)
1033 pids.append(pid)
1035 time.sleep(1.0)
1036 for pid in pids:
1037 #try again in case any are lingerers
1038 try:
1039 os.kill(int(pid), 9)
1040 except OSError, e:
1041 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1042 continue
1043 log('killing %s with -9' % pid)
1044 self.notify_watcher()
1046 def cleanup(self):
1047 self.cleanup_x()
1048 if not config.KEEP_TEMP_FILES:
1049 for fn in os.listdir(self.workdir):
1050 os.remove(os.path.join(self.workdir, fn))
1051 os.rmdir(self.workdir)
1052 else:
1053 log("NOT removing '%s', containing the following files:" % self.workdir)
1054 log(*os.listdir(self.workdir))
1056 self.notify_watcher()
def use_cache():
    """Return True when this host is configured to always use the
    booki-zip cache."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
1062 def _read_cached_zip(server, book, max_age):
1063 #find a recent zip if possible
1064 prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1065 from glob import glob
1066 zips = sorted(glob(prefix + '*.zip'))
1067 if not zips:
1068 log("no cached booki-zips matching %s*.zip" % (prefix,))
1069 return None
1070 zipname = zips[-1]
1071 cutoff = time.time() - max_age * 60
1072 log(repr(zipname))
1073 try:
1074 date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1075 if date > cutoff:
1076 f = open(zipname)
1077 blob = f.read()
1078 f.close()
1079 return blob, zipname
1080 log("%s is too old, must reload" % zipname)
1081 return None
1082 except (IOError, IndexError, ValueError), e:
1083 log('could not make sense of %s: got exception %s' % (zipname, e))
1084 return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for a book from its server.

    Returns (blob, filename): blob is the raw zip data; filename is
    the local path it was saved to, or the passed-in/None value when
    save is false.  If max_age is non-zero (or this host always uses
    the cache), a cached zip no older than max_age minutes may be
    returned instead of fetching.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        # NOTE(review): HTTP_HOST is presumably a module-level name
        # defined earlier in this file -- confirm it is in scope.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book':book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # BUG FIX: zip data is binary; text mode ('w') corrupts it on
        # platforms that translate line endings.
        f = open(filename, 'wb')
        f.write(blob)
        f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    Returns a list of html strings.  Split markers (<hr/> elements
    with config.MARKER_CLASS_SPLIT) are inserted at roughly equal
    intervals, just before a '<', then split_tree divides the
    document there.
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    # one extra piece for each time either size limit is exceeded
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        # cut just before a tag so the marker can't land mid-element
        e = html.find('<', target * (i + 1))
        if e == -1:
            # no tag after this point: don't mangle the tail with a
            # marker (find() returning -1 would otherwise do so)
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]