objavi/fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This contains classes representing books and coordinates their processing.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Library module representing a complete FM book being turned into a
  21 PDF"""
  22
  23 import os, sys
  24 import tempfile
  25 import re, time
  26 import random
  27 import copy
  28 from subprocess import Popen, check_call, PIPE
  29 from cStringIO import StringIO
  30 from urllib2 import urlopen, HTTPError
  31 import zipfile
  32 import traceback
  33 from string import ascii_letters
  34 from pprint import pformat
  35
  36 try:
  37     import json
  38 except ImportError:
  39     import simplejson as json
  40
  41 import lxml.html
  42 from lxml import etree
  43
  44 from objavi import config, epub_utils
  45 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
  46 from objavi.book_utils import ObjaviError, log_types
  47 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
  48 from objavi.epub import add_guts, _find_tag
  49 from objavi.xhtml_utils import EpubChapter, split_tree
  50 from objavi.cgi_utils import url2path, path2url
  51
  52 from iarchive import epub as ia_epub
  53 from booki.bookizip import get_metadata, add_metadata
  54
  55 TMPDIR = os.path.abspath(config.TMPDIR)
  56 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
  57 HTTP_HOST = os.environ.get('HTTP_HOST', '')
  58
  59 def find_archive_urls(bookid, bookname):
  60     s3url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
  61     detailsurl = 'http://archive.org/details/booki-%s' % (bookid,)
  62     return (s3url, detailsurl)
  63
  64 def _get_best_title(tocpoint):
  65     if 'html_title' in tocpoint:
  66         return tocpoint['html_title']
  67     if 'title' in tocpoint:
  68         return tocpoint['title']
  69     return 'Untitled'
  70
  71
  72 def _add_initial_number(e, n):
  73     """Put a styled chapter number n at the beginning of element e."""
  74     initial = e.makeelement("strong", Class="initial")
  75     e.insert(0, initial)
  76     initial.tail = ' '
  77     if e.text is not None:
  78         initial.tail += e.text
  79     e.text = ''
  80     initial.text = "%s." % n
  81
  82 def expand_toc(toc, depth=1, index=0):
  83     """Reformat toc slightly for convenience"""
  84     for item in toc:
  85         url = item.get('url', '').lstrip('/')
  86         bits = url.split('#', 1)
  87         filename = bits[0]
  88         fragment = (bits[1] if len(bits) == 2 else None)
  89         item['depth'] = depth
  90         item["filename"] = filename
  91         item["fragment"] = fragment
  92         item["index"] = index
  93         index += 1
  94         if 'children' in item:
  95             index = expand_toc(item['children'], depth + 1, index)
  96     return index
  97
  98 def _serialise(rtoc, stoc, depth):
  99     for item in rtoc:
 100         url = item['url'].lstrip('/')
 101         bits = url.split('#', 1)
 102         filename = bits[0]
 103         fragment = (bits[1] if len(bits) == 2 else None)
 104         stoc.append({"depth": depth,
 105                      "title": item['title'],
 106                      "url": url,
 107                      "filename": filename,
 108                      "fragment": fragment,
 109                      "type": item['type']
 110                      })
 111         if 'children' in item:
 112             _serialise(item['children'], stoc, depth + 1)
 113
 114
 115 def serialise_toc(rtoc):
 116     """Take the recursive TOC structure and turn it into a list of
 117     serial points.  Reformat some things for convenience."""
 118     stoc = []
 119     _serialise(rtoc, stoc, 1)
 120     for i, x in enumerate(stoc):
 121         x['position'] = i
 122     return stoc
 123
 124 def filename_toc_map(rtoc):
 125     tocmap = {}
 126     #log(rtoc)
 127     def traverse(toc):
 128         for point in toc:
 129             #log(point.keys())
 130             tocmap.setdefault(point['filename'], []).append(point)
 131             if 'children' in point:
 132                 traverse(point['children'])
 133     traverse(rtoc)
 134     return tocmap
 135
 136 def save_data(fn, data):
 137     """Save without tripping up on unicode"""
 138     if isinstance(data, unicode):
 139         data = data.encode('utf8', 'ignore')
 140     f = open(fn, 'w')
 141     f.write(data)
 142     f.close()
 143
 144
 145 class Book(object):
 146     page_numbers = 'latin'
 147     preamble_page_numbers = 'roman'
 148
 149     def notify_watcher(self, message=None):
 150         if self.watchers:
 151             if  message is None:
 152                 #message is the name of the caller
 153                 message = traceback.extract_stack(None, 2)[0][2]
 154             log("notify_watcher called with '%s'" % message)
 155             for w in self.watchers:
 156                 w(message)
 157
 158     def __enter__(self):
 159         return self
 160
 161     def __exit__(self, exc_type, exc_value, tb):
 162         self.notify_watcher(config.FINISHED_MESSAGE)
 163         self.cleanup()
 164         #could deal with exceptions here and return true
 165
 166
 167     def __init__(self, book, server, bookname,
 168                  page_settings=None, watchers=None, isbn=None,
 169                  license=config.DEFAULT_LICENSE, title=None,
 170                  max_age=0):
 171         log("*** Starting new book %s ***" % bookname)
 172         self.watchers = set()
 173         if watchers is not None:
 174             self.watchers.update(watchers)
 175         self.notify_watcher('start')
 176         self.bookname = bookname
 177         self.book = book
 178         self.server = server
 179         self.cookie = ''.join(random.sample(ascii_letters, 10))
 180         try:
 181             blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
 182         except HTTPError, e:
 183             traceback.print_exc()
 184             self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
 185             #not much to do?
 186             #raise 502 Bad Gateway ?
 187             sys.exit()
 188         f = StringIO(blob)
 189         self.notify_watcher('fetch_zip')
 190         self.store = zipfile.ZipFile(f, 'r')
 191         self.info = json.loads(self.store.read('info.json'))
 192         for k in ('manifest', 'metadata', 'spine', 'TOC'):
 193             if k not in self.info:
 194                 raise ObjaviError('info.json of %s lacks vital element "%s"' %
 195                                   (bookname, k))
 196             #check types also?
 197
 198         self.metadata = self.info['metadata']
 199         self.spine = self.info['spine']
 200         self.manifest = self.info['manifest']
 201
 202         if server == config.LOCALHOST: # [DEPRECATED]
 203             server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
 204             book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
 205
 206         log(pformat(self.metadata))
 207         self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
 208         if not self.lang:
 209             self.lang = guess_lang(server, book)
 210             log('guessed lang as %s' % self.lang)
 211
 212         self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
 213         if not self.toc_header:
 214             self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
 215
 216         self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
 217         if not self.dir:
 218             self.dir = guess_text_dir(server, book)
 219
 220         #Patch in the extra metadata. (lang and dir may be set from config)
 221         #these should be read from zip -- so should go into zip?
 222         for var, key, scheme, ns in (
 223             (isbn, 'id', 'ISBN', config.DC),
 224             (license, 'rights', 'License', config.DC),
 225             (title, 'title', '', config.DC),
 226             (self.lang, 'language', '', config.DC),
 227             (self.dir, 'dir', '', config.FM),
 228             ):
 229             if var is not None:
 230                 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
 231
 232         self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
 233         self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
 234
 235         self.toc = self.info['TOC']
 236         expand_toc(self.toc)
 237
 238         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 239         os.chmod(self.workdir, 0755)
 240
 241         self.body_html_file = self.filepath('body.html')
 242         self.body_pdf_file = self.filepath('body.pdf')
 243         self.preamble_html_file = self.filepath('preamble.html')
 244         self.preamble_pdf_file = self.filepath('preamble.pdf')
 245         self.tail_html_file = self.filepath('tail.html')
 246         self.tail_pdf_file = self.filepath('tail.pdf')
 247         self.isbn_pdf_file = None
 248         self.pdf_file = self.filepath('final.pdf')
 249         self.body_odt_file = self.filepath('body.odt')
 250         self.outline_file = self.filepath('outline.txt')
 251
 252         self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
 253
 254         if page_settings is not None:
 255             self.maker = PageSettings(**page_settings)
 256
 257         if title is not None:
 258             self.title = title
 259         else:
 260             titles = get_metadata(self.metadata, 'title')
 261             if titles:
 262                 self.title = titles[0]
 263             else:
 264                 self.title = 'A Book About ' + self.book
 265         if isinstance(self.title, unicode):
 266             self.title = self.title.encode('utf-8')
 267
 268         self.notify_watcher()
 269
 270
 271     if config.TRY_BOOK_CLEANUP_ON_DEL:
 272         #Dont even define __del__ if it is not used.
 273         _try_cleanup_on_del = True
 274         def __del__(self):
 275             if self._try_cleanup_on_del and os.path.exists(self.workdir):
 276                 self._try_cleanup_on_del = False #or else you can get in bad cycles
 277                 self.cleanup()
 278
 279     def get_tree_by_id(self, id):
 280         """get an HTML tree from the given manifest ID"""
 281         name = self.manifest[id]['url']
 282         mimetype = self.manifest[id]['mimetype']
 283         s = self.store.read(name)
 284         f = StringIO(s)
 285         if mimetype == 'text/html':
 286             try:
 287                 tree = lxml.html.parse(f)
 288             except etree.XMLSyntaxError, e:
 289                 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
 290                     (id, name, s[:20], e))
 291                 tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
 292         elif 'xml' in mimetype: #XXX or is this just asking for trouble?
 293             tree = etree.parse(f)
 294         else:
 295             tree = f.read()
 296         f.close()
 297         return tree
 298
 299     def filepath(self, fn):
 300         return os.path.join(self.workdir, fn)
 301
 302     def save_tempfile(self, fn, data):
 303         """Save the data in a temporary directory that will be cleaned
 304         up when all is done.  Return the absolute file path."""
 305         fn = self.filepath(fn)
 306         save_data(fn, data)
 307         return fn
 308
 309     def make_oo_doc(self):
 310         """Make an openoffice document, using the html2odt script."""
 311         self.wait_for_xvfb()
 312         html_text = etree.tostring(self.tree, method="html")
 313         save_data(self.body_html_file, html_text)
 314         run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
 315         log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
 316         os.rename(self.body_odt_file, self.publish_file)
 317         self.notify_watcher()
 318
 319     def extract_pdf_outline(self):
 320         """Get the outline (table of contents) for the PDF, which
 321         wkhtmltopdf should have written to a file.  If that file
 322         doesn't exist (or config says not to use it), fall back to
 323         using self._extract_pdf_outline_the_old_way, below.
 324         """
 325         if config.USE_DUMP_OUTLINE:
 326             try:
 327                 self.outline_contents, number_of_pages = \
 328                                        parse_extracted_outline(self.outline_file)
 329
 330             except Exception, e:
 331                 traceback.print_exc()
 332                 number_of_pages = self._extract_pdf_outline_the_old_way()
 333         else:
 334             number_of_pages = self._extract_pdf_outline_the_old_way()
 335
 336         self.notify_watcher()
 337         return number_of_pages
 338
 339     def _extract_pdf_outline_the_old_way(self):
 340         """Try to get the PDF outline using pdftk.  This doesn't work
 341         well with all scripts."""
 342         debugf = self.filepath('extracted-outline.txt')
 343         self.outline_contents, number_of_pages = \
 344                 parse_outline(self.body_pdf_file, 1, debugf)
 345
 346         if not self.outline_contents:
 347             #probably problems with international text. need a horrible hack
 348             log('no outline: trying again with ascii headings')
 349             import copy
 350             tree = copy.deepcopy(self.tree)
 351             titlemap = {}
 352             for tag in ('h1', 'h2', 'h3', 'h4'):
 353                 for i, e in enumerate(tree.getiterator(tag)):
 354                     key = "%s_%s" % (tag, i)
 355                     titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
 356                     del e[:]
 357                     if tag == 'h1':
 358                         e = lxml.etree.SubElement(e, "strong", Class="initial")
 359                     e.text = key
 360                     log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))
 361
 362             ascii_html_file = self.filepath('body-ascii-headings.html')
 363             ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
 364             html_text = lxml.etree.tostring(tree, method="html")
 365             save_data(ascii_html_file, html_text)
 366             self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
 367             debugf = self.filepath('ascii-extracted-outline.txt')
 368             ascii_contents, number_of_ascii_pages = \
 369                 parse_outline(ascii_pdf_file, 1, debugf)
 370             self.outline_contents = []
 371             log ("number of pages: %s, post ascii: %s" %
 372                  (number_of_pages, number_of_ascii_pages))
 373             for ascii_title, depth, pageno in ascii_contents:
 374                 if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
 375                     ascii_title = ascii_title[:-4]
 376                 if ' ' in ascii_title:
 377                     ascii_title = ascii_title.rsplit(' ', 1)[1]
 378                 title = titlemap.get(ascii_title, '')
 379                 log((ascii_title, title, depth, pageno))
 380
 381                 self.outline_contents.append((title, depth, pageno))
 382
 383         return number_of_pages
 384
 385     def make_body_pdf(self):
 386         """Make a pdf of the HTML, using webkit"""
 387         #1. Save the html
 388         html_text = etree.tostring(self.tree, method="html")
 389         save_data(self.body_html_file, html_text)
 390
 391         #2. Make a pdf of it
 392         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
 393         self.notify_watcher('generate_pdf')
 394
 395         n_pages = self.extract_pdf_outline()
 396
 397         log ("found %s pages in pdf" % n_pages)
 398         #4. resize pages, shift gutters, even pages
 399         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 400         self.notify_watcher('reshape_pdf')
 401
 402         #5 add page numbers
 403         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 404                               numbers=self.page_numbers)
 405         self.notify_watcher("number_pdf")
 406         self.notify_watcher()
 407
 408     def make_preamble_pdf(self):
 409         contents = self.make_contents()
 410         inside_cover_html = self.compose_inside_cover()
 411         log_types(self.dir, self.css_url, self.title, inside_cover_html,
 412                   self.toc_header, contents, self.title)
 413
 414         html = ('<html dir="%s"><head>\n'
 415                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 416                 '<link rel="stylesheet" href="%s" />\n'
 417                 '</head>\n<body>\n'
 418                 '<h1 class="frontpage">%s</h1>'
 419                 '%s\n'
 420                 '<div class="contents"><h1>%s</h1>\n%s</div>\n'
 421                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 422                 '<!--%s--></div></body></html>'
 423                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 424                      self.toc_header, contents, self.title)
 425         save_data(self.preamble_html_file, html)
 426
 427         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
 428
 429         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 430
 431         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 432                             numbers=self.preamble_page_numbers,
 433                             number_start=-2)
 434
 435         self.notify_watcher()
 436
 437     def make_end_matter_pdf(self):
 438         """Make an inside back cover and a back cover.  If there is an
 439         isbn number its barcode will be put on the back cover."""
 440         if self.isbn:
 441             self.isbn_pdf_file = self.filepath('isbn.pdf')
 442             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 443             self.notify_watcher('make_barcode_pdf')
 444
 445         end_matter = self.compose_end_matter()
 446         #log(end_matter)
 447         save_data(self.tail_html_file, end_matter.decode('utf-8'))
 448         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
 449
 450         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 451                                centre_end=True, even_pages=False)
 452         self.notify_watcher()
 453
 454     def make_book_pdf(self):
 455         """A convenient wrapper of a few necessary steps"""
 456         # now the Xvfb server is needed. make sure it has had long enough to get going
 457         self.wait_for_xvfb()
 458         self.make_body_pdf()
 459         self.make_preamble_pdf()
 460         self.make_end_matter_pdf()
 461
 462         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 463                     self.body_pdf_file, self.tail_pdf_file,
 464                     self.isbn_pdf_file)
 465
 466         self.notify_watcher('concatenated_pdfs')
 467
 468     def make_templated_html(self, template=None, zip=False, index=config.TEMPLATING_INDEX_FIRST):
 469         """Make a templated html version of the book."""
 470         #set up the directory and static files
 471         self.unpack_static()
 472         destdir = self.filepath('html')
 473         os.mkdir(destdir)
 474         os.rename(self.filepath('static'), self.filepath(os.path.join(destdir, 'static')))
 475
 476         if not template:
 477             template_tree = lxml.html.parse(config.TEMPLATING_DEFAULT_TEMPLATE).getroot()
 478         else:
 479             template_tree = lxml.html.document_fromstring(template)
 480
 481         tocmap = filename_toc_map(self.toc)
 482         contents_name, first_name = config.TEMPLATING_INDEX_MODES[index]
 483
 484         #build a contents page and a contents menu
 485         #We can't make this in the same pass because the menu needs to
 486         #go in every page (i.e., into the template)
 487         menu = etree.Element('ul', Class=config.TEMPLATING_MENU_ELEMENT)
 488         contents = etree.Element('div', Class=config.TEMPLATING_REPLACED_ELEMENT)
 489
 490         booktitle = etree.Element('div', Class=config.TEMPLATING_BOOK_TITLE_ELEMENT)
 491         log(self.title)
 492         booktitle.text = self.title.decode('utf-8')
 493
 494         etree.SubElement(contents, 'h1').text = self.title.decode('utf-8')
 495
 496         savename = first_name
 497         for ID in self.spine:
 498             filename = self.manifest[ID]['url']
 499             #handle any TOC points in this file.
 500             for point in tocmap[filename]:
 501                 if point['type'] == 'booki-section':
 502                     etree.SubElement(contents, 'h2').text = point['title']
 503                     etree.SubElement(menu, 'li', Class='booki-section').text = point['title']
 504                 else:
 505                     if savename is None:
 506                         savename = filename
 507                     div = etree.SubElement(contents, 'div')
 508                     etree.SubElement(div, 'a', href=savename).text = point['title']
 509                     li = etree.SubElement(menu, 'li')
 510                     li.tail = '\n'
 511                     etree.SubElement(li, 'a', href=savename).text = point['title']
 512                     savename = None
 513         #put the menu and book title into the template (if it wants it)
 514         for e in template_tree.iterdescendants(config.TEMPLATING_MENU_ELEMENT):
 515             e.getparent().replace(e, copy.deepcopy(menu))
 516         for e in template_tree.iterdescendants(config.TEMPLATING_BOOK_TITLE_ELEMENT):
 517             e.getparent().replace(e, copy.deepcopy(booktitle))
 518
 519         #function to template content and write to disk
 520         def save_content(content, title, filename):
 521             if not isinstance(title, unicode):
 522                 title = title.decode('utf-8')
 523             content.set('id', config.TEMPLATING_CONTENTS_ID)
 524             content.tag = 'div'
 525             dest = copy.deepcopy(template_tree)
 526             dest.set('dir', self.dir)
 527             for e in dest.iterdescendants(config.TEMPLATING_REPLACED_ELEMENT):
 528                 #copy only if there are more than 2
 529                 if content.getparent() is not None:
 530                     content = copy.deepcopy(content)
 531                 e.getparent().replace(e, content)
 532
 533             chaptertitle = etree.Element('div', Class=config.TEMPLATING_CHAPTER_TITLE_ELEMENT)
 534             chaptertitle.text = title
 535             for e in template_tree.iterdescendants(config.TEMPLATING_CHAPTER_TITLE_ELEMENT):
 536                 e.getparent().replace(e, copy.deepcopy(chaptertitle))
 537             for e in dest.iterdescendants('title'):
 538                 #log(type(title), title)
 539                 e.text = title
 540             self.save_tempfile(os.path.join(destdir, filename), lxml.html.tostring(dest))
 541
 542
 543         #write the contents to a file. (either index.html or contents.html)
 544         save_content(contents, self.title, contents_name)
 545
 546         savename = first_name
 547         #and now write each chapter to a file
 548         for ID in self.spine:
 549             filename = self.manifest[ID]['url']
 550             try:
 551                 root = self.get_tree_by_id(ID).getroot()
 552                 body = root.find('body')
 553             except Exception, e:
 554                 log("hit %s when trying book.get_tree_by_id(%s).getroot().find('body')" % (e, ID))
 555                 body = etree.Element('body')
 556
 557             #handle any TOC points in this file.  There should only be one!
 558             for point in tocmap[filename]:
 559                 if point['type'] != 'booki-section':
 560                     title = point['title']
 561                     break
 562             else:
 563                 title = self.title
 564
 565             if savename is None:
 566                 savename = filename
 567             save_content(body, title, savename)
 568             savename = None
 569         log(destdir, self.publish_file)
 570         os.rename(destdir, self.publish_file)
 571         self.notify_watcher()
 572
 573
 574     def make_simple_pdf(self, mode):
 575         """Make a simple pdf document without contents or separate
 576         title page.  This is used for multicolumn newspapers and for
 577         web-destined pdfs."""
 578         self.wait_for_xvfb()
 579         #0. Add heading to begining of html
 580         body = list(self.tree.cssselect('body'))[0]
 581         e = body.makeelement('h1', {'id': 'book-title'})
 582         e.text = self.title.decode('utf-8')
 583         body.insert(0, e)
 584         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 585         e.addnext(intro)
 586
 587         #0.5 adjust parameters to suit the particular kind of output
 588         if mode == 'web':
 589             self.maker.gutter = 0
 590
 591         #1. Save the html
 592         html_text = etree.tostring(self.tree, method="html")
 593         save_data(self.body_html_file, html_text)
 594
 595         #2. Make a pdf of it (direct to to final pdf)
 596         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
 597         self.notify_watcher('generate_pdf')
 598         n_pages = count_pdf_pages(self.pdf_file)
 599
 600         if mode != 'web':
 601             #3. resize pages and shift gutters.
 602             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 603             self.notify_watcher('reshape_pdf')
 604
 605             #4. add page numbers
 606             self.maker.number_pdf(self.pdf_file, n_pages,
 607                                   dir=self.dir, numbers=self.page_numbers)
 608             self.notify_watcher("number_pdf")
 609         self.notify_watcher()
 610
 611
 612     def rotate180(self):
 613         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 614         presses."""
 615         rotated = self.filepath('final-rotate.pdf')
 616         unrotated = self.filepath('final-pre-rotate.pdf')
 617         #leave the unrotated pdf intact at first, in case of error.
 618         rotate_pdf(self.pdf_file, rotated)
 619         os.rename(self.pdf_file, unrotated)
 620         os.rename(rotated, self.pdf_file)
 621         self.notify_watcher()
 622
 623     def publish_pdf(self):
 624         """Move the finished PDF to its final resting place"""
 625         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 626         os.rename(self.pdf_file, self.publish_file)
 627         self.notify_watcher()
 628
 629     def publish_bookizip(self):
 630         """Publish the bookizip.  For this, copy rather than move,
 631         because the bookizip might be used by further processing.  If
 632         possible, a hard link is created."""
 633         log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
 634         try:
 635             run(['cp', '-l', self.bookizip_file, self.publish_file])
 636         except OSError:
 637             run(['cp', self.bookizip_file, self.publish_file])
 638         self.notify_watcher()
 639
 640     def concat_html(self):
 641         """Join all the chapters together into one tree.  Keep the TOC
 642         up-to-date along the way."""
 643
 644         #each manifest item looks like:
 645         #{'contributors': []
 646         #'license': [],
 647         #'mimetype': '',
 648         #'rightsholders': []
 649         #'url': ''}
 650         doc = lxml.html.document_fromstring('<html dir="%s"><body dir="%s"></body></html>'
 651                                             % (self.dir, self.dir))
 652         tocmap = filename_toc_map(self.toc)
 653         for ID in self.spine:
 654             details = self.manifest[ID]
 655             #log(ID, pformat(details))
 656             # ACO MIJENJAO
 657             try:
 658                 root = self.get_tree_by_id(ID).getroot()
 659             except Exception, e:
 660                 log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
 661                 continue
 662             #handle any TOC points in this file
 663             for point in tocmap[details['url']]:
 664                 #if the url has a #identifier, use it. Otherwise, make
 665                 #one up, using a hidden element at the beginning of
 666                 #the inserted document.
 667                 #XXX this will break if different files use the same ids
 668                 #XXX should either replace all, or replace selectively.
 669                 if point['fragment']:
 670                     fragment = point['fragment']
 671                 else:
 672                     body = _find_tag(root, 'body')
 673                     fragment = '%s_%s' % (self.cookie, point['index'])
 674                     #reuse first tag if it is suitable.
 675                     if (len(body) and
 676                         body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
 677                         if body[0].get('id') is None:
 678                             body[0].set('id', fragment)
 679                         else:
 680                             fragment = body[0].get('id')
 681                         #the chapter starts with a heading. that heading should be the chapter name.
 682                         if body[0].tag in ('h1', 'h2', 'h3'):
 683                             #log('chapter has title "%s", found html title "%s"' %
 684                             #    (point['title'], body[0].text_content()))
 685                             point['html_title'] = body[0].text_content()
 686                     else:
 687                         marker = body.makeelement('div', style="display:none",
 688                                                   id=fragment)
 689                         body.insert(0, marker)
 690                 point['html_id'] = fragment
 691
 692             add_guts(root, doc)
 693         return doc
 694
 695     def unpack_static(self):
 696         """Extract static files from the zip for the html to refer to."""
 697         static_files = [x['url'] for x in self.manifest.values()
 698                         if x['url'].startswith('static')]
 699         if static_files:
 700             os.mkdir(self.filepath('static'))
 701
 702         for name in static_files:
 703             s = self.store.read(name)
 704             f = open(self.filepath(name), 'w')
 705             f.write(s)
 706             f.close()
 707         self.notify_watcher()
 708
 709     def load_book(self):
 710         """"""
 711         #XXX concatenate the HTML to match how TWiki version worked.
 712         # This is perhaps foolishly early -- throwing away useful boundaries.
 713         self.unpack_static()
 714         self.tree = self.concat_html()
 715         self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))
 716
 717         self.headings = [x for x in self.tree.cssselect('h1')]
 718         if self.headings:
 719             self.headings[0].set('class', "first-heading")
 720         for h1 in self.headings:
 721             h1.title = h1.text_content().strip()
 722         self.notify_watcher()
 723
 724     def make_contents(self):
 725         """Generate HTML containing the table of contents.  This can
 726         only be done after the main PDF has been made, because the
 727         page numbers are contained in the PDF outline."""
 728         header = '<table class="toc">\n'
 729         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 730                     '<td class="pagenumber">%s</td></tr>\n')
 731         empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
 732         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 733         footer = '\n</table>'
 734
 735         contents = []
 736
 737         chapter = 1
 738         page_num = 1
 739         #log(self.outline_contents)
 740         outline_contents = iter(self.outline_contents)
 741
 742         for section in self.toc:
 743             if not section.get('children'):
 744                 contents.append(empty_section_tmpl % section['title'])
 745                 continue
 746             contents.append(section_tmpl % section['title'])
 747
 748             for point in section['children']:
 749                 try:
 750                     level = 99
 751                     while level > 1:
 752                         h1_text, level, page_num = outline_contents.next()
 753                 except StopIteration:
 754                     log("contents data not found for %s. Stopping" % (point,))
 755                     break
 756                 contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
 757                 chapter += 1
 758
 759         doc = header + '\n'.join(contents) + footer
 760         if isinstance(doc, unicode):
 761             doc = doc.encode('utf-8')
 762         self.notify_watcher()
 763         return doc
 764
 765     def add_section_titles(self):
 766         """Add any section heading pages that the TOC.txt file
 767         specifies.  These are sub-book, super-chapter groupings.
 768
 769         Also add initial numbers to chapters.
 770         """
 771         chapter = 1
 772         section = None
 773         #log(self.toc)
 774         for t in self.toc:
 775             #only top level sections get a subsection page,
 776             #and only if they have children.
 777             if t.get('children'):
 778                 section = self.tree.makeelement('div', Class="objavi-subsection")
 779                 heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
 780                 heading.text = t['title']
 781                 for child in t['children']:
 782                     item = etree.SubElement(section, 'div', Class="objavi-chapter")
 783                     if 'html_title' in child:
 784                         item.text = child['html_title']
 785                         heading = self.tree.cssselect('#'+ child['html_id'])
 786                         if heading:
 787                             _add_initial_number(heading[0], chapter)
 788                     else:
 789                         item.text = child['title']
 790                     _add_initial_number(item, chapter)
 791                     log(item.text, debug='HTMLGEN')
 792                     chapter += 1
 793                 log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
 794                 location = self.tree.cssselect('#'+ t['html_id'])[0]
 795                 location.addprevious(section)
 796
 797
 798         self.notify_watcher()
 799
 800
 801     def add_css(self, css=None, mode='book'):
 802         """If css looks like a url, use it as a stylesheet link.
 803         Otherwise it is the CSS itself, which is saved to a temporary file
 804         and linked to."""
 805         log("css is %r" % css)
 806         htmltree = self.tree
 807         if css is None or not css.strip():
 808             css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
 809             if css_default is None:
 810                 #guess from language -- this should come first
 811                 css_modes = config.LANGUAGE_CSS.get(self.lang,
 812                                                     config.LANGUAGE_CSS['en'])
 813                 css_default = css_modes.get(mode, css_modes[None])
 814             url = css_default
 815         elif not re.match(r'^http://\S+$', css):
 816             url = path2url(self.save_tempfile('objavi.css', css), full=True)
 817         else:
 818             url = css
 819
 820         #find the head -- it's probably first child but lets not assume.
 821         for child in htmltree:
 822             if child.tag == 'head':
 823                 head = child
 824                 break
 825         else:
 826             head = htmltree.makeelement('head')
 827             htmltree.insert(0, head)
 828
 829         link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 830         self.css_url = url
 831         self.notify_watcher()
 832         return url
 833
 834
 835     def _read_localised_template(self, template, fallbacks=['en']):
 836         """Try to get the template in the approriate language, otherwise in english."""
 837         for lang in [self.lang] + fallbacks:
 838             try:
 839                 fn = template % (lang)
 840                 f = open(fn)
 841                 break
 842             except IOError, e:
 843                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 844                 log(e)
 845         template = f.read()
 846         f.close()
 847         return template
 848
 849     def compose_inside_cover(self):
 850         """create the markup for the preamble inside cover."""
 851         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 852
 853         if self.isbn:
 854             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 855         else:
 856             isbn_text = ''
 857
 858         return template % {'date': time.strftime('%Y-%m-%d'),
 859                            'isbn': isbn_text,
 860                            'license': self.license,
 861                            }
 862
 863
 864     def compose_end_matter(self):
 865         """create the markup for the end_matter inside cover.  If
 866         self.isbn is not set, the html will result in a pdf that
 867         spills onto two pages.
 868         """
 869         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 870
 871         d = {'css_url': self.css_url,
 872              'title': self.title
 873              }
 874
 875         if self.isbn:
 876             d['inside_cover_style'] = ''
 877         else:
 878             d['inside_cover_style'] = 'page-break-after: always'
 879
 880         return template % d
 881
 882
 883     def make_epub(self, use_cache=False):
 884         """Make an epub version of the book, using Mike McCabe's
 885         epub module for the Internet Archive."""
 886         ebook = ia_epub.Book(self.publish_file, content_dir='')
 887         def add_file(ID, filename, mediatype, content):
 888             ebook.add_content({'media-type': mediatype.encode('utf-8'),
 889                                'id': ID.encode('utf-8'),
 890                                'href': filename.encode('utf-8'),
 891                                }, content)
 892
 893         toc = self.info['TOC']
 894
 895         #manifest
 896         filemap = {} #map html to corresponding xhtml
 897         spinemap = {} #map IDs to multi-file chapters
 898         for ID in self.manifest:
 899             details = self.manifest[ID]
 900             #log(ID, pformat(details))
 901             fn, mediatype = details['url'], details['mimetype']
 902             content = self.store.read(fn)
 903             if mediatype == 'text/html':
 904                 #convert to application/xhtml+xml, and perhaps split
 905                 c = EpubChapter(self.server, self.book, ID, content,
 906                                 use_cache=use_cache)
 907                 c.remove_bad_tags()
 908                 if fn[-5:] == '.html':
 909                     fnbase = fn[:-5]
 910                 else:
 911                     fnbase = fn
 912                 fnx = fnbase + '.xhtml'
 913                 mediatype = 'application/xhtml+xml'
 914
 915                 fragments = split_html(c.as_xhtml(),
 916                                        compressed_size=self.store.getinfo(fn).compress_size)
 917
 918                 #add the first one as if it is the whole thing (as it often is)
 919                 add_file(ID, fnx, mediatype, fragments[0])
 920                 filemap[fn] = fnx
 921                 if len(fragments) > 1:
 922                     spine_ids = [ID]
 923                     spinemap[ID] = spine_ids
 924                     #add any extras
 925                     for i in range(1, len(fragments)):
 926                         # XXX it is possible for duplicates if another
 927                         # file happens to have this name. Ignore for now
 928                         _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
 929                         spine_ids.append(_id)
 930                         add_file(_id,
 931                                  '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
 932                                  mediatype, fragments[i])
 933
 934             else:
 935                 add_file(ID, fn, mediatype, content)
 936
 937         #toc
 938         ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
 939         ebook.add(ebook.content_dir + 'toc.ncx', ncx)
 940
 941         #spine
 942         for ID in self.spine:
 943             if ID in spinemap:
 944                 for x in spinemap[ID]:
 945                     ebook.add_spine_item({'idref': x})
 946             else:
 947                 ebook.add_spine_item({'idref': ID})
 948
 949         #metadata -- no use of attributes (yet)
 950         # and fm: metadata disappears for now
 951         DCNS = config.DCNS
 952         DC = config.DC
 953         meta_info_items = []
 954         for ns, namespace in self.metadata.items():
 955             for keyword, schemes in namespace.items():
 956                 if ns:
 957                     keyword = '{%s}%s' % (ns, keyword)
 958                 for scheme, values in schemes.items():
 959                     for value in values:
 960                         item = {
 961                             'item': keyword,
 962                             'text': value,
 963                             }
 964                         if scheme:
 965                             if keyword in (DCNS + 'creator', DCNS + 'contributor'):
 966                                 item['atts'] = {'role': scheme}
 967                             else:
 968                                 item['atts'] = {'scheme': scheme}
 969
 970         has_authors = 'creator' in self.metadata[DC]
 971         if not has_authors and config.CLAIM_UNAUTHORED:
 972             authors = []
 973             for x in self.metadata[DC]['creator'].values():
 974                 authors.extend(x)
 975
 976             meta_info_items.append({'item': DCNS + 'creator',
 977                                     'text': 'The Contributors'})
 978
 979             meta_info_items.append({'item': DCNS + 'rights',
 980                                     'text': 'This book is free. Copyright %s' % (', '.join(authors))}
 981                                    )
 982
 983         tree_str = ia_epub.make_opf(meta_info_items,
 984                                     ebook.manifest_items,
 985                                     ebook.spine_items,
 986                                     ebook.guide_items,
 987                                     ebook.cover_id)
 988         ebook.add(ebook.content_dir + 'content.opf', tree_str)
 989         ebook.z.close()
 990         self.notify_watcher()
 991
 992
 993     def publish_s3(self):
 994         """Push the book's epub to archive.org, using S3."""
 995         #XXX why only epub?
 996         secrets = {}
 997         for x in ('S3_SECRET', 'S3_ACCESSKEY'):
 998             fn = getattr(config, x)
 999             f = open(fn)
1000             secrets[x] = f.read().strip()
1001             f.close()
1002
1003         now = time.strftime('%F')
1004         s3output = self.filepath('s3-output.txt')
1005         s3url, detailsurl = find_archive_urls(self.book, self.bookname)
1006         headers = [
1007             'x-amz-auto-make-bucket:1',
1008             "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
1009             'x-archive-meta-mediatype:texts',
1010             'x-archive-meta-collection:opensource',
1011             'x-archive-meta-title:%s' % (self.book,),
1012             'x-archive-meta-date:%s' % (now,),
1013             'x-archive-meta-creator:FLOSS Manuals Contributors',
1014             ]
1015
1016         if self.license in config.LICENSES:
1017             headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
1018
1019         argv = ['curl', '--location', '-s', '-o', s3output]
1020         for h in headers:
1021             argv.extend(('--header', h))
1022         argv.extend(('--upload-file', self.publish_file, s3url,))
1023
1024         log(' '.join(repr(x) for x in argv))
1025         check_call(argv, stdout=sys.stderr)
1026         self.notify_watcher()
1027         return detailsurl, s3url
1028
1029     def publish_shared(self, group=None, user=None):
1030         """Make symlinks from the BOOKI_SHARED_DIRECTORY to the
1031         published file, so that a virtual host can be set up to
1032         publish the files from a static location.  If group is set, it
1033         is used as a subdirectory, otherwise a virtual group like
1034         'lonely-user-XXX' is used."""
1035         if group is None:
1036             if user is None:
1037                 return
1038             group = config.BOOKI_SHARED_LONELY_USER_PREFIX + user
1039         group = group.replace('..', '+').replace('/', '+')
1040         group = re.sub("[^\w%.,-]+", "_", group)[:250]
1041         groupdir = os.path.join(config.BOOKI_SHARED_DIRECTORY, group)
1042
1043         generic_name = re.sub(r'-\d{4}\.\d\d\.\d\d\-\d\d\.\d\d\.\d\d', '', self.bookname)
1044         log(self.bookname, generic_name)
1045
1046         if not os.path.exists(groupdir):
1047             os.mkdir(groupdir)
1048
1049         #change directory, for least symlink confusion
1050         pwd = os.getcwd()
1051         os.chdir(groupdir)
1052         if os.path.exists(generic_name):
1053             os.unlink(generic_name)
1054         os.symlink(os.path.abspath(self.publish_file), generic_name)
1055         os.chdir(pwd)
1056
1057
1058     def spawn_x(self):
1059         """Start an Xvfb instance, using a new server number.  A
1060         reference to it is stored in self.xvfb, which is used to kill
1061         it when the pdf is done.
1062
1063         Note that Xvfb doesn't interact well with dbus which is
1064         present on modern desktops.
1065         """
1066         #Find an unused server number (in case two cgis are running at once)
1067         while True:
1068             servernum = random.randrange(50, 500)
1069             if not os.path.exists('/tmp/.X%s-lock' % servernum):
1070                 break
1071
1072         self.xserver_no = ':%s' % servernum
1073
1074         authfile = self.filepath('Xauthority')
1075         os.environ['XAUTHORITY'] = authfile
1076
1077         #mcookie(1) eats into /dev/random, so avoid that
1078         from hashlib import md5
1079         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
1080         mcookie = m.hexdigest()
1081
1082         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
1083
1084         self.xvfb = Popen(['Xvfb', self.xserver_no,
1085                            '-screen', '0', '1024x768x24',
1086                            '-pixdepths', '32',
1087                            #'-blackpixel', '0',
1088                            #'-whitepixel', str(2 ** 24 -1),
1089                            #'+extension', 'Composite',
1090                            '-dpi', '96',
1091                            #'-kb',
1092                            '-nolisten', 'tcp',
1093                            ])
1094
1095         # We need to wait a bit before the Xvfb is ready.  but the
1096         # downloads are so slow that that probably doesn't matter
1097
1098         self.xvfb_ready_time = time.time() + 2
1099
1100         os.environ['DISPLAY'] = self.xserver_no
1101         log(self.xserver_no)
1102
1103     def wait_for_xvfb(self):
1104         """wait until a previously set time before continuing.  This
1105         is so Xvfb has time to properly start."""
1106         if hasattr(self, 'xvfb'):
1107             d = self.xvfb_ready_time - time.time()
1108             if d > 0:
1109                 time.sleep(d)
1110                 self.notify_watcher()
1111
1112     def cleanup_x(self):
1113         """Try very hard to kill off Xvfb.  In addition to killing
1114         this instance's xvfb, occasionally (randomly) search for
1115         escaped Xvfb instances and kill those too."""
1116         if not hasattr(self, 'xvfb'):
1117             return
1118         check_call(['xauth', 'remove', self.xserver_no])
1119         p = self.xvfb
1120         log("trying to kill Xvfb %s" % p.pid)
1121         os.kill(p.pid, 15)
1122         for i in range(10):
1123             if p.poll() is not None:
1124                 log("%s died with %s" % (p.pid, p.poll()))
1125                 break
1126             log("%s not dead yet" % p.pid)
1127             time.sleep(0.2)
1128         else:
1129             log("Xvfb would not die! kill -9! kill -9!")
1130             try:
1131                 os.kill(p.pid, 9)
1132             except OSError, e:
1133                 log(e)
1134
1135         if random.random() < 0.1:
1136             # occasionally kill old xvfbs and soffices, if there are any.
1137             self.kill_old_processes()
1138
1139     def kill_old_processes(self):
1140         """Sometimes, despite everything, Xvfb or soffice instances
1141         hang around well after they are wanted -- for example if the
1142         cgi process dies particularly badly. So kill them if they have
1143         been running for a long time."""
1144         log("running kill_old_processes")
1145         killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1146                                    os.path.basename(config.HTML2ODT),
1147                                    os.path.basename(config.WKHTMLTOPDF),
1148                                    ])
1149         p = Popen(['ps', '-C', killable_names,
1150                    '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1151         data = p.communicate()[0].strip()
1152         if data:
1153             lines = data.split('\n')
1154             pids = []
1155             for line in lines:
1156                 log('dealing with ps output "%s"' % line)
1157                 try:
1158                     pid, days, hours, minutes, seconds \
1159                          = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1160                 except AttributeError:
1161                     log("Couldn't parse that line!")
1162                 # 50 minutes should be enough xvfb time for anyone
1163                 if days or hours or int(minutes) > 50:
1164                     pid = int(pid)
1165                     log("going to kill pid %s" % pid)
1166                     os.kill(pid, 15)
1167                     pids.append(pid)
1168
1169             time.sleep(1.0)
1170             for pid in pids:
1171                 #try again in case any are lingerers
1172                 try:
1173                     os.kill(int(pid), 9)
1174                 except OSError, e:
1175                     log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1176                     continue
1177                 log('killing %s with -9' % pid)
1178         self.notify_watcher()
1179
1180     def cleanup(self):
1181         self.cleanup_x()
1182         if not config.KEEP_TEMP_FILES:
1183             for fn in os.listdir(self.workdir):
1184                 os.remove(os.path.join(self.workdir, fn))
1185             os.rmdir(self.workdir)
1186         else:
1187             log("NOT removing '%s', containing the following files:" % self.workdir)
1188             log(*os.listdir(self.workdir))
1189
1190         self.notify_watcher()
1191
1192
1193 def use_cache():
1194     return (os.environ.get('HTTP_HOST') in config.USE_ZIP_CACHE_ALWAYS_HOSTS)
1195
1196 def _read_cached_zip(server, book, max_age):
1197     #find a recent zip if possible
1198     prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1199     from glob import glob
1200     zips = sorted(glob(prefix + '*.zip'))
1201     if not zips:
1202         log("no cached booki-zips matching %s*.zip" % (prefix,))
1203         return None
1204     zipname = zips[-1]
1205     cutoff = time.time() - max_age * 60
1206     log(repr(zipname))
1207     try:
1208         date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1209         if date > cutoff:
1210             f = open(zipname)
1211             blob = f.read()
1212             f.close()
1213             return blob, zipname
1214         log("%s is too old, must reload" % zipname)
1215         return None
1216     except (IOError, IndexError, ValueError), e:
1217         log('could not make sense of %s: got exception %s' % (zipname, e))
1218         return None
1219
1220
1221 def fetch_zip(server, book, save=False, max_age=-1, filename=None):
1222     interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
1223     try:
1224         url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
1225                                             'server': server, 'book':book}
1226     except KeyError:
1227         raise NotImplementedError("Can't handle '%s' interface" % interface)
1228
1229     if use_cache() and max_age < 0:
1230         #default to 12 hours cache on objavi.halo.gen.nz
1231         max_age = 12 * 60
1232
1233     if max_age:
1234         log('WARNING: trying to use cached booki-zip',
1235             'If you are debugging booki-zip creation, you will go CRAZY'
1236             ' unless you switch this off')
1237         blob_and_name = _read_cached_zip(server, book, max_age)
1238         if blob_and_name is not None:
1239             return blob_and_name
1240
1241     log('fetching zip from %s'% url)
1242     f = urlopen(url)
1243     blob = f.read()
1244     f.close()
1245     if save:
1246         if filename is None:
1247             filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
1248                                   make_book_name(book, server, '.zip'))
1249         f = open(filename, 'w')
1250         f.write(blob)
1251         f.close()
1252     return blob, filename
1253
1254
1255 def split_html(html, compressed_size=None, fix_markup=False):
1256     """Split long html files into pieces that will work nicely on a
1257     Sony Reader."""
1258     if compressed_size is None:
1259         import zlib
1260         compressed_size = len(zlib.compress(html))
1261
1262     splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
1263                  len(html) // config.EPUB_FILE_SIZE_MAX)
1264     log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))
1265
1266     if not splits:
1267         return [html]
1268
1269     if fix_markup:
1270         #remove '<' in attributes etc, which makes the marker
1271         #insertion more reliable
1272         html = etree.tostring(lxml.html.fromstring(html),
1273                               encoding='UTF-8',
1274                               #method='html'
1275                               )
1276
1277     target = len(html) // (splits + 1)
1278     s = 0
1279     fragments = []
1280     for i in range(splits):
1281         e = html.find('<', target * (i + 1))
1282         fragments.append(html[s:e])
1283         fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
1284         s = e
1285     fragments.append(html[s:])
1286
1287     #XXX somehow try to avoid split in silly places (e.g, before inline elements)
1288     chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
1289     return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]
1290