objavi/fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This contains classes representing books and coordinates their processing.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Library module representing a complete FM book being turned into a
  21 PDF"""
  22
  23 import os, sys
  24 import tempfile
  25 import re, time
  26 import random
  27 import copy
  28 from subprocess import Popen, check_call, PIPE
  29 from cStringIO import StringIO
  30 from urllib2 import urlopen, HTTPError
  31 import zipfile
  32 import traceback
  33 from string import ascii_letters
  34 from pprint import pformat
  35
  36 try:
  37     import json
  38 except ImportError:
  39     import simplejson as json
  40
  41 import lxml.html
  42 from lxml import etree
  43
  44 from objavi import config, epub_utils
  45 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
  46 from objavi.book_utils import ObjaviError, log_types
  47 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
  48 from objavi.epub import add_guts, _find_tag
  49 from objavi.xhtml_utils import EpubChapter, split_tree
  50 from objavi.cgi_utils import url2path, path2url
  51
  52 from iarchive import epub as ia_epub
  53 from booki.bookizip import get_metadata, add_metadata
  54
  55 TMPDIR = os.path.abspath(config.TMPDIR)
  56 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
  57 HTTP_HOST = os.environ.get('HTTP_HOST', '')
  58
  59 def find_archive_urls(bookid, bookname):
  60     s3url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
  61     detailsurl = 'http://archive.org/details/booki-%s' % (bookid,)
  62     return (s3url, detailsurl)
  63
  64 def _get_best_title(tocpoint):
  65     if 'html_title' in tocpoint:
  66         return tocpoint['html_title']
  67     if 'title' in tocpoint:
  68         return tocpoint['title']
  69     return 'Untitled'
  70
  71
  72 def _add_initial_number(e, n):
  73     """Put a styled chapter number n at the beginning of element e."""
  74     initial = e.makeelement("strong", Class="initial")
  75     e.insert(0, initial)
  76     initial.tail = ' '
  77     if e.text is not None:
  78         initial.tail += e.text
  79     e.text = ''
  80     initial.text = "%s." % n
  81
  82 def expand_toc(toc, depth=1, index=0):
  83     """Reformat toc slightly for convenience"""
  84     for item in toc:
  85         url = item['url'].lstrip('/')
  86         bits = url.split('#', 1)
  87         filename = bits[0]
  88         fragment = (bits[1] if len(bits) == 2 else None)
  89         item['depth'] = depth
  90         item["filename"] = filename
  91         item["fragment"] = fragment
  92         item["index"] = index
  93         index += 1
  94         if 'children' in item:
  95             index = expand_toc(item['children'], depth + 1, index)
  96     return index
  97
  98 def _serialise(rtoc, stoc, depth):
  99     for item in rtoc:
 100         url = item['url'].lstrip('/')
 101         bits = url.split('#', 1)
 102         filename = bits[0]
 103         fragment = (bits[1] if len(bits) == 2 else None)
 104         stoc.append({"depth": depth,
 105                      "title": item['title'],
 106                      "url": url,
 107                      "filename": filename,
 108                      "fragment": fragment,
 109                      "type": item['type']
 110                      })
 111         if 'children' in item:
 112             _serialise(item['children'], stoc, depth + 1)
 113
 114
 115 def serialise_toc(rtoc):
 116     """Take the recursive TOC structure and turn it into a list of
 117     serial points.  Reformat some things for convenience."""
 118     stoc = []
 119     _serialise(rtoc, stoc, 1)
 120     for i, x in enumerate(stoc):
 121         x['position'] = i
 122     return stoc
 123
 124 def filename_toc_map(rtoc):
 125     tocmap = {}
 126     #log(rtoc)
 127     def traverse(toc):
 128         for point in toc:
 129             #log(point.keys())
 130             tocmap.setdefault(point['filename'], []).append(point)
 131             if 'children' in point:
 132                 traverse(point['children'])
 133     traverse(rtoc)
 134     return tocmap
 135
 136 def save_data(fn, data):
 137     """Save without tripping up on unicode"""
 138     if isinstance(data, unicode):
 139         data = data.encode('utf8', 'ignore')
 140     f = open(fn, 'w')
 141     f.write(data)
 142     f.close()
 143
 144
 145 class Book(object):
 146     page_numbers = 'latin'
 147     preamble_page_numbers = 'roman'
 148
 149     def notify_watcher(self, message=None):
 150         if self.watchers:
 151             if  message is None:
 152                 #message is the name of the caller
 153                 message = traceback.extract_stack(None, 2)[0][2]
 154             log("notify_watcher called with '%s'" % message)
 155             for w in self.watchers:
 156                 w(message)
 157
 158     def __enter__(self):
 159         return self
 160
 161     def __exit__(self, exc_type, exc_value, tb):
 162         self.notify_watcher(config.FINISHED_MESSAGE)
 163         self.cleanup()
 164         #could deal with exceptions here and return true
 165
 166
 167     def __init__(self, book, server, bookname,
 168                  page_settings=None, watchers=None, isbn=None,
 169                  license=config.DEFAULT_LICENSE, title=None,
 170                  max_age=0):
 171         log("*** Starting new book %s ***" % bookname)
 172         self.watchers = set()
 173         if watchers is not None:
 174             self.watchers.update(watchers)
 175         self.notify_watcher('start')
 176         self.bookname = bookname
 177         self.book = book
 178         self.server = server
 179         self.cookie = ''.join(random.sample(ascii_letters, 10))
 180         try:
 181             blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
 182         except HTTPError, e:
 183             traceback.print_exc()
 184             self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
 185             #not much to do?
 186             #raise 502 Bad Gateway ?
 187             sys.exit()
 188         f = StringIO(blob)
 189         self.notify_watcher('fetch_zip')
 190         self.store = zipfile.ZipFile(f, 'r')
 191         self.info = json.loads(self.store.read('info.json'))
 192         for k in ('manifest', 'metadata', 'spine', 'TOC'):
 193             if k not in self.info:
 194                 raise ObjaviError('info.json of %s lacks vital element "%s"' %
 195                                   (bookname, k))
 196             #check types also?
 197
 198         self.metadata = self.info['metadata']
 199         self.spine = self.info['spine']
 200         self.manifest = self.info['manifest']
 201
 202         if server == config.LOCALHOST: # [DEPRECATED]
 203             server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
 204             book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
 205
 206         log(pformat(self.metadata))
 207         self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
 208         if not self.lang:
 209             self.lang = guess_lang(server, book)
 210             log('guessed lang as %s' % self.lang)
 211
 212         self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
 213         if not self.toc_header:
 214             self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
 215
 216         self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
 217         if not self.dir:
 218             self.dir = guess_text_dir(server, book)
 219
 220         #Patch in the extra metadata. (lang and dir may be set from config)
 221         #these should be read from zip -- so should go into zip?
 222         for var, key, scheme, ns in (
 223             (isbn, 'id', 'ISBN', config.DC),
 224             (license, 'rights', 'License', config.DC),
 225             (title, 'title', '', config.DC),
 226             (self.lang, 'language', '', config.DC),
 227             (self.dir, 'dir', '', config.FM),
 228             ):
 229             if var is not None:
 230                 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
 231
 232         self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
 233         self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
 234
 235         self.toc = self.info['TOC']
 236         expand_toc(self.toc)
 237
 238         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 239         os.chmod(self.workdir, 0755)
 240
 241         self.body_html_file = self.filepath('body.html')
 242         self.body_pdf_file = self.filepath('body.pdf')
 243         self.preamble_html_file = self.filepath('preamble.html')
 244         self.preamble_pdf_file = self.filepath('preamble.pdf')
 245         self.tail_html_file = self.filepath('tail.html')
 246         self.tail_pdf_file = self.filepath('tail.pdf')
 247         self.isbn_pdf_file = None
 248         self.pdf_file = self.filepath('final.pdf')
 249         self.body_odt_file = self.filepath('body.odt')
 250         self.outline_file = self.filepath('outline.txt')
 251
 252         self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
 253
 254         if page_settings is not None:
 255             self.maker = PageSettings(**page_settings)
 256
 257         if title is not None:
 258             self.title = title
 259         else:
 260             titles = get_metadata(self.metadata, 'title')
 261             if titles:
 262                 self.title = titles[0]
 263             else:
 264                 self.title = 'A Book About ' + self.book
 265         if isinstance(self.title, unicode):
 266             self.title = self.title.encode('utf-8')
 267
 268         self.notify_watcher()
 269
 270
 271     if config.TRY_BOOK_CLEANUP_ON_DEL:
 272         #Dont even define __del__ if it is not used.
 273         _try_cleanup_on_del = True
 274         def __del__(self):
 275             if self._try_cleanup_on_del and os.path.exists(self.workdir):
 276                 self._try_cleanup_on_del = False #or else you can get in bad cycles
 277                 self.cleanup()
 278
 279     def get_tree_by_id(self, id):
 280         """get an HTML tree from the given manifest ID"""
 281         name = self.manifest[id]['url']
 282         mimetype = self.manifest[id]['mimetype']
 283         s = self.store.read(name)
 284         f = StringIO(s)
 285         if mimetype == 'text/html':
 286             try:
 287                 tree = lxml.html.parse(f)
 288             except etree.XMLSyntaxError, e:
 289                 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
 290                     (id, name, s[:20], e))
 291                 tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
 292         elif 'xml' in mimetype: #XXX or is this just asking for trouble?
 293             tree = etree.parse(f)
 294         else:
 295             tree = f.read()
 296         f.close()
 297         return tree
 298
 299     def filepath(self, fn):
 300         return os.path.join(self.workdir, fn)
 301
 302     def save_tempfile(self, fn, data):
 303         """Save the data in a temporary directory that will be cleaned
 304         up when all is done.  Return the absolute file path."""
 305         fn = self.filepath(fn)
 306         save_data(fn, data)
 307         return fn
 308
 309     def make_oo_doc(self):
 310         """Make an openoffice document, using the html2odt script."""
 311         self.wait_for_xvfb()
 312         html_text = etree.tostring(self.tree, method="html")
 313         save_data(self.body_html_file, html_text)
 314         run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
 315         log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
 316         os.rename(self.body_odt_file, self.publish_file)
 317         self.notify_watcher()
 318
 319     def extract_pdf_outline(self):
 320         """Get the outline (table of contents) for the PDF, which
 321         wkhtmltopdf should have written to a file.  If that file
 322         doesn't exist (or config says not to use it), fall back to
 323         using self._extract_pdf_outline_the_old_way, below.
 324         """
 325         if config.USE_DUMP_OUTLINE:
 326             try:
 327                 self.outline_contents, number_of_pages = \
 328                                        parse_extracted_outline(self.outline_file)
 329
 330             except Exception, e:
 331                 traceback.print_exc()
 332                 number_of_pages = self._extract_pdf_outline_the_old_way()
 333         else:
 334             number_of_pages = self._extract_pdf_outline_the_old_way()
 335
 336         self.notify_watcher()
 337         return number_of_pages
 338
 339     def _extract_pdf_outline_the_old_way(self):
 340         """Try to get the PDF outline using pdftk.  This doesn't work
 341         well with all scripts."""
 342         debugf = self.filepath('extracted-outline.txt')
 343         self.outline_contents, number_of_pages = \
 344                 parse_outline(self.body_pdf_file, 1, debugf)
 345
 346         if not self.outline_contents:
 347             #probably problems with international text. need a horrible hack
 348             log('no outline: trying again with ascii headings')
 349             import copy
 350             tree = copy.deepcopy(self.tree)
 351             titlemap = {}
 352             for tag in ('h1', 'h2', 'h3', 'h4'):
 353                 for i, e in enumerate(tree.getiterator(tag)):
 354                     key = "%s_%s" % (tag, i)
 355                     titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
 356                     del e[:]
 357                     if tag == 'h1':
 358                         e = lxml.etree.SubElement(e, "strong", Class="initial")
 359                     e.text = key
 360                     log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))
 361
 362             ascii_html_file = self.filepath('body-ascii-headings.html')
 363             ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
 364             html_text = lxml.etree.tostring(tree, method="html")
 365             save_data(ascii_html_file, html_text)
 366             self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
 367             debugf = self.filepath('ascii-extracted-outline.txt')
 368             ascii_contents, number_of_ascii_pages = \
 369                 parse_outline(ascii_pdf_file, 1, debugf)
 370             self.outline_contents = []
 371             log ("number of pages: %s, post ascii: %s" %
 372                  (number_of_pages, number_of_ascii_pages))
 373             for ascii_title, depth, pageno in ascii_contents:
 374                 if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
 375                     ascii_title = ascii_title[:-4]
 376                 if ' ' in ascii_title:
 377                     ascii_title = ascii_title.rsplit(' ', 1)[1]
 378                 title = titlemap.get(ascii_title, '')
 379                 log((ascii_title, title, depth, pageno))
 380
 381                 self.outline_contents.append((title, depth, pageno))
 382
 383         return number_of_pages
 384
 385     def make_body_pdf(self):
 386         """Make a pdf of the HTML, using webkit"""
 387         #1. Save the html
 388         html_text = etree.tostring(self.tree, method="html")
 389         save_data(self.body_html_file, html_text)
 390
 391         #2. Make a pdf of it
 392         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
 393         self.notify_watcher('generate_pdf')
 394
 395         n_pages = self.extract_pdf_outline()
 396
 397         log ("found %s pages in pdf" % n_pages)
 398         #4. resize pages, shift gutters, even pages
 399         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 400         self.notify_watcher('reshape_pdf')
 401
 402         #5 add page numbers
 403         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 404                               numbers=self.page_numbers)
 405         self.notify_watcher("number_pdf")
 406         self.notify_watcher()
 407
 408     def make_preamble_pdf(self):
 409         contents = self.make_contents()
 410         inside_cover_html = self.compose_inside_cover()
 411         log_types(self.dir, self.css_url, self.title, inside_cover_html,
 412                   self.toc_header, contents, self.title)
 413
 414         html = ('<html dir="%s"><head>\n'
 415                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 416                 '<link rel="stylesheet" href="%s" />\n'
 417                 '</head>\n<body>\n'
 418                 '<h1 class="frontpage">%s</h1>'
 419                 '%s\n'
 420                 '<div class="contents"><h1>%s</h1>\n%s</div>\n'
 421                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 422                 '<!--%s--></div></body></html>'
 423                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 424                      self.toc_header, contents, self.title)
 425         save_data(self.preamble_html_file, html)
 426
 427         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
 428
 429         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 430
 431         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 432                             numbers=self.preamble_page_numbers,
 433                             number_start=-2)
 434
 435         self.notify_watcher()
 436
 437     def make_end_matter_pdf(self):
 438         """Make an inside back cover and a back cover.  If there is an
 439         isbn number its barcode will be put on the back cover."""
 440         if self.isbn:
 441             self.isbn_pdf_file = self.filepath('isbn.pdf')
 442             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 443             self.notify_watcher('make_barcode_pdf')
 444
 445         end_matter = self.compose_end_matter()
 446         #log(end_matter)
 447         save_data(self.tail_html_file, end_matter.decode('utf-8'))
 448         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
 449
 450         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 451                                centre_end=True, even_pages=False)
 452         self.notify_watcher()
 453
 454     def make_book_pdf(self):
 455         """A convenient wrapper of a few necessary steps"""
 456         # now the Xvfb server is needed. make sure it has had long enough to get going
 457         self.wait_for_xvfb()
 458         self.make_body_pdf()
 459         self.make_preamble_pdf()
 460         self.make_end_matter_pdf()
 461
 462         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 463                     self.body_pdf_file, self.tail_pdf_file,
 464                     self.isbn_pdf_file)
 465
 466         self.notify_watcher('concatenated_pdfs')
 467
 468     def make_templated_html(self, template=None, zip=False, index=config.TEMPLATING_INDEX_FIRST):
 469         """Make a templated html version of the book."""
 470         #set up the directory and static files
 471         self.unpack_static()
 472         destdir = self.filepath('html')
 473         os.mkdir(destdir)
 474         os.rename(self.filepath('static'), self.filepath(os.path.join(destdir, 'static')))
 475
 476         if not template:
 477             template_tree = lxml.html.parse(config.TEMPLATING_DEFAULT_TEMPLATE).getroot()
 478         else:
 479             template_tree = lxml.html.document_fromstring(template)
 480
 481         tocmap = filename_toc_map(self.toc)
 482         contents_name, first_name = config.TEMPLATING_INDEX_MODES[index]
 483
 484         #build a contents page and a contents menu
 485         #We can't make this in the same pass because the menu needs to
 486         #go in every page (i.e., into the template)
 487         menu = etree.Element('ul', Class=config.TEMPLATING_MENU_ELEMENT)
 488         contents = etree.Element('div', Class=config.TEMPLATING_REPLACED_ELEMENT)
 489
 490         booktitle = etree.Element('div', Class=config.TEMPLATING_BOOK_TITLE_ELEMENT)
 491         log(self.title)
 492         booktitle.text = self.title.decode('utf-8')
 493
 494         etree.SubElement(contents, 'h1').text = self.title.decode('utf-8')
 495
 496         savename = first_name
 497         for ID in self.spine:
 498             filename = self.manifest[ID]['url']
 499             #handle any TOC points in this file.
 500             for point in tocmap[filename]:
 501                 if point['type'] == 'booki-section':
 502                     etree.SubElement(contents, 'h2').text = point['title']
 503                     etree.SubElement(menu, 'li', Class='booki-section').text = point['title']
 504                 else:
 505                     if savename is None:
 506                         savename = filename
 507                     div = etree.SubElement(contents, 'div')
 508                     etree.SubElement(div, 'a', href=savename).text = point['title']
 509                     li = etree.SubElement(menu, 'li')
 510                     li.tail = '\n'
 511                     etree.SubElement(li, 'a', href=savename).text = point['title']
 512                     savename = None
 513         #put the menu and book title into the template (if it wants it)
 514         for e in template_tree.iterdescendants(config.TEMPLATING_MENU_ELEMENT):
 515             e.getparent().replace(e, copy.deepcopy(menu))
 516         for e in template_tree.iterdescendants(config.TEMPLATING_BOOK_TITLE_ELEMENT):
 517             e.getparent().replace(e, copy.deepcopy(booktitle))
 518
 519         #function to template content and write to disk
 520         def save_content(content, title, filename):
 521             if not isinstance(title, unicode):
 522                 title = title.decode('utf-8')
 523             content.set('id', config.TEMPLATING_CONTENTS_ID)
 524             content.tag = 'div'
 525             dest = copy.deepcopy(template_tree)
 526             dest.set('dir', self.dir)
 527             for e in dest.iterdescendants(config.TEMPLATING_REPLACED_ELEMENT):
 528                 #copy only if there are more than 2
 529                 if content.getparent() is not None:
 530                     content = copy.deepcopy(content)
 531                 e.getparent().replace(e, content)
 532
 533             chaptertitle = etree.Element('div', Class=config.TEMPLATING_CHAPTER_TITLE_ELEMENT)
 534             chaptertitle.text = title
 535             for e in template_tree.iterdescendants(config.TEMPLATING_CHAPTER_TITLE_ELEMENT):
 536                 e.getparent().replace(e, copy.deepcopy(chaptertitle))
 537             for e in dest.iterdescendants('title'):
 538                 #log(type(title), title)
 539                 e.text = title
 540             self.save_tempfile(os.path.join(destdir, filename), lxml.html.tostring(dest))
 541
 542
 543         #write the contents to a file. (either index.html or contents.html)
 544         save_content(contents, self.title, contents_name)
 545
 546         savename = first_name
 547         #and now write each chapter to a file
 548         for ID in self.spine:
 549             filename = self.manifest[ID]['url']
 550             try:
 551                 root = self.get_tree_by_id(ID).getroot()
 552                 body = root.find('body')
 553             except Exception, e:
 554                 log("hit %s when trying book.get_tree_by_id(%s).getroot().find('body')" % (e, ID))
 555                 body = etree.Element('body')
 556
 557             #handle any TOC points in this file.  There should only be one!
 558             for point in tocmap[filename]:
 559                 if point['type'] != 'booki-section':
 560                     title = point['title']
 561                     break
 562             else:
 563                 title = self.title
 564
 565             if savename is None:
 566                 savename = filename
 567             save_content(body, title, savename)
 568             savename = None
 569         log(destdir, self.publish_file)
 570         os.rename(destdir, self.publish_file)
 571         self.notify_watcher()
 572
 573
 574     def make_simple_pdf(self, mode):
 575         """Make a simple pdf document without contents or separate
 576         title page.  This is used for multicolumn newspapers and for
 577         web-destined pdfs."""
 578         self.wait_for_xvfb()
 579         #0. Add heading to begining of html
 580         body = list(self.tree.cssselect('body'))[0]
 581         e = body.makeelement('h1', {'id': 'book-title'})
 582         e.text = self.title
 583         body.insert(0, e)
 584         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 585         e.addnext(intro)
 586
 587         #0.5 adjust parameters to suit the particular kind of output
 588         if mode == 'web':
 589             self.maker.gutter = 0
 590
 591         #1. Save the html
 592         html_text = etree.tostring(self.tree, method="html")
 593         save_data(self.body_html_file, html_text)
 594
 595         #2. Make a pdf of it (direct to to final pdf)
 596         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
 597         self.notify_watcher('generate_pdf')
 598         n_pages = count_pdf_pages(self.pdf_file)
 599
 600         if mode != 'web':
 601             #3. resize pages and shift gutters.
 602             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 603             self.notify_watcher('reshape_pdf')
 604
 605             #4. add page numbers
 606             self.maker.number_pdf(self.pdf_file, n_pages,
 607                                   dir=self.dir, numbers=self.page_numbers)
 608             self.notify_watcher("number_pdf")
 609         self.notify_watcher()
 610
 611
 612     def rotate180(self):
 613         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 614         presses."""
 615         rotated = self.filepath('final-rotate.pdf')
 616         unrotated = self.filepath('final-pre-rotate.pdf')
 617         #leave the unrotated pdf intact at first, in case of error.
 618         rotate_pdf(self.pdf_file, rotated)
 619         os.rename(self.pdf_file, unrotated)
 620         os.rename(rotated, self.pdf_file)
 621         self.notify_watcher()
 622
 623     def publish_pdf(self):
 624         """Move the finished PDF to its final resting place"""
 625         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 626         os.rename(self.pdf_file, self.publish_file)
 627         self.notify_watcher()
 628
 629     def publish_bookizip(self):
 630         """Publish the bookizip.  For this, copy rather than move,
 631         because the bookizip might be used by further processing.  If
 632         possible, a hard link is created."""
 633         log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
 634         try:
 635             run(['cp', '-l', self.bookizip_file, self.publish_file])
 636         except OSError:
 637             run(['cp', self.bookizip_file, self.publish_file])
 638         self.notify_watcher()
 639
 640     def concat_html(self):
 641         """Join all the chapters together into one tree.  Keep the TOC
 642         up-to-date along the way."""
 643
 644         #each manifest item looks like:
 645         #{'contributors': []
 646         #'license': [],
 647         #'mimetype': '',
 648         #'rightsholders': []
 649         #'url': ''}
 650         doc = lxml.html.document_fromstring('<html><body></body></html>')
 651         tocmap = filename_toc_map(self.toc)
 652         for ID in self.spine:
 653             details = self.manifest[ID]
 654             #log(ID, pformat(details))
 655             # ACO MIJENJAO
 656             try:
 657                 root = self.get_tree_by_id(ID).getroot()
 658             except Exception, e:
 659                 log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
 660                 continue
 661             #handle any TOC points in this file
 662             for point in tocmap[details['url']]:
 663                 #if the url has a #identifier, use it. Otherwise, make
 664                 #one up, using a hidden element at the beginning of
 665                 #the inserted document.
 666                 #XXX this will break if different files use the same ids
 667                 #XXX should either replace all, or replace selectively.
 668                 if point['fragment']:
 669                     fragment = point['fragment']
 670                 else:
 671                     body = _find_tag(root, 'body')
 672                     fragment = '%s_%s' % (self.cookie, point['index'])
 673                     #reuse first tag if it is suitable.
 674                     if (len(body) and
 675                         body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
 676                         if body[0].get('id') is None:
 677                             body[0].set('id', fragment)
 678                         else:
 679                             fragment = body[0].get('id')
 680                         #the chapter starts with a heading. that heading should be the chapter name.
 681                         if body[0].tag in ('h1', 'h2', 'h3'):
 682                             #log('chapter has title "%s", found html title "%s"' %
 683                             #    (point['title'], body[0].text_content()))
 684                             point['html_title'] = body[0].text_content()
 685                     else:
 686                         marker = body.makeelement('div', style="display:none",
 687                                                   id=fragment)
 688                         body.insert(0, marker)
 689                 point['html_id'] = fragment
 690
 691             add_guts(root, doc)
 692         return doc
 693
 694     def unpack_static(self):
 695         """Extract static files from the zip for the html to refer to."""
 696         static_files = [x['url'] for x in self.manifest.values()
 697                         if x['url'].startswith('static')]
 698         if static_files:
 699             os.mkdir(self.filepath('static'))
 700
 701         for name in static_files:
 702             s = self.store.read(name)
 703             f = open(self.filepath(name), 'w')
 704             f.write(s)
 705             f.close()
 706         self.notify_watcher()
 707
 708     def load_book(self):
 709         """"""
 710         #XXX concatenate the HTML to match how TWiki version worked.
 711         # This is perhaps foolishly early -- throwing away useful boundaries.
 712         self.unpack_static()
 713         self.tree = self.concat_html()
 714         self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))
 715
 716         self.headings = [x for x in self.tree.cssselect('h1')]
 717         if self.headings:
 718             self.headings[0].set('class', "first-heading")
 719         for h1 in self.headings:
 720             h1.title = h1.text_content().strip()
 721         self.notify_watcher()
 722
 723     def make_contents(self):
 724         """Generate HTML containing the table of contents.  This can
 725         only be done after the main PDF has been made, because the
 726         page numbers are contained in the PDF outline."""
 727         header = '<table class="toc">\n'
 728         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 729                     '<td class="pagenumber">%s</td></tr>\n')
 730         empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
 731         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 732         footer = '\n</table>'
 733
 734         contents = []
 735
 736         chapter = 1
 737         page_num = 1
 738         #log(self.outline_contents)
 739         outline_contents = iter(self.outline_contents)
 740
 741         for section in self.toc:
 742             if not section.get('children'):
 743                 contents.append(empty_section_tmpl % section['title'])
 744                 continue
 745             contents.append(section_tmpl % section['title'])
 746
 747             for point in section['children']:
 748                 try:
 749                     level = 99
 750                     while level > 1:
 751                         h1_text, level, page_num = outline_contents.next()
 752                 except StopIteration:
 753                     log("contents data not found for %s. Stopping" % (point,))
 754                     break
 755                 contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
 756                 chapter += 1
 757
 758         doc = header + '\n'.join(contents) + footer
 759         if isinstance(doc, unicode):
 760             doc = doc.encode('utf-8')
 761         self.notify_watcher()
 762         return doc
 763
 764     def add_section_titles(self):
 765         """Add any section heading pages that the TOC.txt file
 766         specifies.  These are sub-book, super-chapter groupings.
 767
 768         Also add initial numbers to chapters.
 769         """
 770         chapter = 1
 771         section = None
 772         #log(self.toc)
 773         for t in self.toc:
 774             #only top level sections get a subsection page,
 775             #and only if they have children.
 776             if t.get('children'):
 777                 section = self.tree.makeelement('div', Class="objavi-subsection")
 778                 heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
 779                 heading.text = t['title']
 780                 for child in t['children']:
 781                     item = etree.SubElement(section, 'div', Class="objavi-chapter")
 782                     if 'html_title' in child:
 783                         item.text = child['html_title']
 784                         heading = self.tree.cssselect('#'+ child['html_id'])
 785                         if heading:
 786                             _add_initial_number(heading[0], chapter)
 787                     else:
 788                         item.text = child['title']
 789                     _add_initial_number(item, chapter)
 790                     log(item.text, debug='HTMLGEN')
 791                     chapter += 1
 792                 log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
 793                 location = self.tree.cssselect('#'+ t['html_id'])[0]
 794                 location.addprevious(section)
 795
 796
 797         self.notify_watcher()
 798
 799
 800     def add_css(self, css=None, mode='book'):
 801         """If css looks like a url, use it as a stylesheet link.
 802         Otherwise it is the CSS itself, which is saved to a temporary file
 803         and linked to."""
 804         log("css is %r" % css)
 805         htmltree = self.tree
 806         if css is None or not css.strip():
 807             css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
 808             if css_default is None:
 809                 #guess from language -- this should come first
 810                 css_modes = config.LANGUAGE_CSS.get(self.lang,
 811                                                     config.LANGUAGE_CSS['en'])
 812                 css_default = css_modes.get(mode, css_modes[None])
 813             url = css_default
 814         elif not re.match(r'^http://\S+$', css):
 815             url = path2url(self.save_tempfile('objavi.css', css), full=True)
 816         else:
 817             url = css
 818
 819         #find the head -- it's probably first child but lets not assume.
 820         for child in htmltree:
 821             if child.tag == 'head':
 822                 head = child
 823                 break
 824         else:
 825             head = htmltree.makeelement('head')
 826             htmltree.insert(0, head)
 827
 828         link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 829         self.css_url = url
 830         self.notify_watcher()
 831         return url
 832
 833
 834     def _read_localised_template(self, template, fallbacks=['en']):
 835         """Try to get the template in the approriate language, otherwise in english."""
 836         for lang in [self.lang] + fallbacks:
 837             try:
 838                 fn = template % (lang)
 839                 f = open(fn)
 840                 break
 841             except IOError, e:
 842                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 843                 log(e)
 844         template = f.read()
 845         f.close()
 846         return template
 847
 848     def compose_inside_cover(self):
 849         """create the markup for the preamble inside cover."""
 850         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 851
 852         if self.isbn:
 853             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 854         else:
 855             isbn_text = ''
 856
 857         return template % {'date': time.strftime('%Y-%m-%d'),
 858                            'isbn': isbn_text,
 859                            'license': self.license,
 860                            }
 861
 862
 863     def compose_end_matter(self):
 864         """create the markup for the end_matter inside cover.  If
 865         self.isbn is not set, the html will result in a pdf that
 866         spills onto two pages.
 867         """
 868         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 869
 870         d = {'css_url': self.css_url,
 871              'title': self.title
 872              }
 873
 874         if self.isbn:
 875             d['inside_cover_style'] = ''
 876         else:
 877             d['inside_cover_style'] = 'page-break-after: always'
 878
 879         return template % d
 880
 881
 882     def make_epub(self, use_cache=False):
 883         """Make an epub version of the book, using Mike McCabe's
 884         epub module for the Internet Archive."""
 885         ebook = ia_epub.Book(self.publish_file, content_dir='')
 886         def add_file(ID, filename, mediatype, content):
 887             ebook.add_content({'media-type': mediatype.encode('utf-8'),
 888                                'id': ID.encode('utf-8'),
 889                                'href': filename.encode('utf-8'),
 890                                }, content)
 891
 892         toc = self.info['TOC']
 893
 894         #manifest
 895         filemap = {} #map html to corresponding xhtml
 896         spinemap = {} #map IDs to multi-file chapters
 897         for ID in self.manifest:
 898             details = self.manifest[ID]
 899             #log(ID, pformat(details))
 900             fn, mediatype = details['url'], details['mimetype']
 901             content = self.store.read(fn)
 902             if mediatype == 'text/html':
 903                 #convert to application/xhtml+xml, and perhaps split
 904                 c = EpubChapter(self.server, self.book, ID, content,
 905                                 use_cache=use_cache)
 906                 c.remove_bad_tags()
 907                 if fn[-5:] == '.html':
 908                     fnbase = fn[:-5]
 909                 else:
 910                     fnbase = fn
 911                 fnx = fnbase + '.xhtml'
 912                 mediatype = 'application/xhtml+xml'
 913
 914                 fragments = split_html(c.as_xhtml(),
 915                                        compressed_size=self.store.getinfo(fn).compress_size)
 916
 917                 #add the first one as if it is the whole thing (as it often is)
 918                 add_file(ID, fnx, mediatype, fragments[0])
 919                 filemap[fn] = fnx
 920                 if len(fragments) > 1:
 921                     spine_ids = [ID]
 922                     spinemap[ID] = spine_ids
 923                     #add any extras
 924                     for i in range(1, len(fragments)):
 925                         # XXX it is possible for duplicates if another
 926                         # file happens to have this name. Ignore for now
 927                         _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
 928                         spine_ids.append(_id)
 929                         add_file(_id,
 930                                  '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
 931                                  mediatype, fragments[i])
 932
 933             else:
 934                 add_file(ID, fn, mediatype, content)
 935
 936         #toc
 937         ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
 938         ebook.add(ebook.content_dir + 'toc.ncx', ncx)
 939
 940         #spine
 941         for ID in self.spine:
 942             if ID in spinemap:
 943                 for x in spinemap[ID]:
 944                     ebook.add_spine_item({'idref': x})
 945             else:
 946                 ebook.add_spine_item({'idref': ID})
 947
 948         #metadata -- no use of attributes (yet)
 949         # and fm: metadata disappears for now
 950         DCNS = config.DCNS
 951         DC = config.DC
 952         meta_info_items = []
 953         for ns, namespace in self.metadata.items():
 954             for keyword, schemes in namespace.items():
 955                 if ns:
 956                     keyword = '{%s}%s' % (ns, keyword)
 957                 for scheme, values in schemes.items():
 958                     for value in values:
 959                         item = {
 960                             'item': keyword,
 961                             'text': value,
 962                             }
 963                         if scheme:
 964                             if keyword in (DCNS + 'creator', DCNS + 'contributor'):
 965                                 item['atts'] = {'role': scheme}
 966                             else:
 967                                 item['atts'] = {'scheme': scheme}
 968
 969         has_authors = 'creator' in self.metadata[DC]
 970         if not has_authors and config.CLAIM_UNAUTHORED:
 971             authors = []
 972             for x in self.metadata[DC]['creator'].values():
 973                 authors.extend(x)
 974
 975             meta_info_items.append({'item': DCNS + 'creator',
 976                                     'text': 'The Contributors'})
 977
 978             meta_info_items.append({'item': DCNS + 'rights',
 979                                     'text': 'This book is free. Copyright %s' % (', '.join(authors))}
 980                                    )
 981
 982         tree_str = ia_epub.make_opf(meta_info_items,
 983                                     ebook.manifest_items,
 984                                     ebook.spine_items,
 985                                     ebook.guide_items,
 986                                     ebook.cover_id)
 987         ebook.add(ebook.content_dir + 'content.opf', tree_str)
 988         ebook.z.close()
 989         self.notify_watcher()
 990
 991
 992     def publish_s3(self):
 993         """Push the book's epub to archive.org, using S3."""
 994         #XXX why only epub?
 995         secrets = {}
 996         for x in ('S3_SECRET', 'S3_ACCESSKEY'):
 997             fn = getattr(config, x)
 998             f = open(fn)
 999             secrets[x] = f.read().strip()
1000             f.close()
1001
1002         now = time.strftime('%F')
1003         s3output = self.filepath('s3-output.txt')
1004         s3url, detailsurl = find_archive_urls(self.book, self.bookname)
1005         headers = [
1006             'x-amz-auto-make-bucket:1',
1007             "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
1008             'x-archive-meta-mediatype:texts',
1009             'x-archive-meta-collection:opensource',
1010             'x-archive-meta-title:%s' % (self.book,),
1011             'x-archive-meta-date:%s' % (now,),
1012             'x-archive-meta-creator:FLOSS Manuals Contributors',
1013             ]
1014
1015         if self.license in config.LICENSES:
1016             headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
1017
1018         argv = ['curl', '--location', '-s', '-o', s3output]
1019         for h in headers:
1020             argv.extend(('--header', h))
1021         argv.extend(('--upload-file', self.publish_file, s3url,))
1022
1023         log(' '.join(repr(x) for x in argv))
1024         check_call(argv, stdout=sys.stderr)
1025         self.notify_watcher()
1026         return detailsurl, s3url
1027
1028
1029     def spawn_x(self):
1030         """Start an Xvfb instance, using a new server number.  A
1031         reference to it is stored in self.xvfb, which is used to kill
1032         it when the pdf is done.
1033
1034         Note that Xvfb doesn't interact well with dbus which is
1035         present on modern desktops.
1036         """
1037         #Find an unused server number (in case two cgis are running at once)
1038         while True:
1039             servernum = random.randrange(50, 500)
1040             if not os.path.exists('/tmp/.X%s-lock' % servernum):
1041                 break
1042
1043         self.xserver_no = ':%s' % servernum
1044
1045         authfile = self.filepath('Xauthority')
1046         os.environ['XAUTHORITY'] = authfile
1047
1048         #mcookie(1) eats into /dev/random, so avoid that
1049         from hashlib import md5
1050         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
1051         mcookie = m.hexdigest()
1052
1053         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
1054
1055         self.xvfb = Popen(['Xvfb', self.xserver_no,
1056                            '-screen', '0', '1024x768x24',
1057                            '-pixdepths', '32',
1058                            #'-blackpixel', '0',
1059                            #'-whitepixel', str(2 ** 24 -1),
1060                            #'+extension', 'Composite',
1061                            '-dpi', '96',
1062                            #'-kb',
1063                            '-nolisten', 'tcp',
1064                            ])
1065
1066         # We need to wait a bit before the Xvfb is ready.  but the
1067         # downloads are so slow that that probably doesn't matter
1068
1069         self.xvfb_ready_time = time.time() + 2
1070
1071         os.environ['DISPLAY'] = self.xserver_no
1072         log(self.xserver_no)
1073
1074     def wait_for_xvfb(self):
1075         """wait until a previously set time before continuing.  This
1076         is so Xvfb has time to properly start."""
1077         if hasattr(self, 'xvfb'):
1078             d = self.xvfb_ready_time - time.time()
1079             if d > 0:
1080                 time.sleep(d)
1081                 self.notify_watcher()
1082
1083     def cleanup_x(self):
1084         """Try very hard to kill off Xvfb.  In addition to killing
1085         this instance's xvfb, occasionally (randomly) search for
1086         escaped Xvfb instances and kill those too."""
1087         if not hasattr(self, 'xvfb'):
1088             return
1089         check_call(['xauth', 'remove', self.xserver_no])
1090         p = self.xvfb
1091         log("trying to kill Xvfb %s" % p.pid)
1092         os.kill(p.pid, 15)
1093         for i in range(10):
1094             if p.poll() is not None:
1095                 log("%s died with %s" % (p.pid, p.poll()))
1096                 break
1097             log("%s not dead yet" % p.pid)
1098             time.sleep(0.2)
1099         else:
1100             log("Xvfb would not die! kill -9! kill -9!")
1101             try:
1102                 os.kill(p.pid, 9)
1103             except OSError, e:
1104                 log(e)
1105
1106         if random.random() < 0.1:
1107             # occasionally kill old xvfbs and soffices, if there are any.
1108             self.kill_old_processes()
1109
1110     def kill_old_processes(self):
1111         """Sometimes, despite everything, Xvfb or soffice instances
1112         hang around well after they are wanted -- for example if the
1113         cgi process dies particularly badly. So kill them if they have
1114         been running for a long time."""
1115         log("running kill_old_processes")
1116         killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1117                                    os.path.basename(config.HTML2ODT),
1118                                    os.path.basename(config.WKHTMLTOPDF),
1119                                    ])
1120         p = Popen(['ps', '-C', killable_names,
1121                    '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1122         data = p.communicate()[0].strip()
1123         if data:
1124             lines = data.split('\n')
1125             pids = []
1126             for line in lines:
1127                 log('dealing with ps output "%s"' % line)
1128                 try:
1129                     pid, days, hours, minutes, seconds \
1130                          = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1131                 except AttributeError:
1132                     log("Couldn't parse that line!")
1133                 # 50 minutes should be enough xvfb time for anyone
1134                 if days or hours or int(minutes) > 50:
1135                     pid = int(pid)
1136                     log("going to kill pid %s" % pid)
1137                     os.kill(pid, 15)
1138                     pids.append(pid)
1139
1140             time.sleep(1.0)
1141             for pid in pids:
1142                 #try again in case any are lingerers
1143                 try:
1144                     os.kill(int(pid), 9)
1145                 except OSError, e:
1146                     log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1147                     continue
1148                 log('killing %s with -9' % pid)
1149         self.notify_watcher()
1150
1151     def cleanup(self):
1152         self.cleanup_x()
1153         if not config.KEEP_TEMP_FILES:
1154             for fn in os.listdir(self.workdir):
1155                 os.remove(os.path.join(self.workdir, fn))
1156             os.rmdir(self.workdir)
1157         else:
1158             log("NOT removing '%s', containing the following files:" % self.workdir)
1159             log(*os.listdir(self.workdir))
1160
1161         self.notify_watcher()
1162
1163
1164 def use_cache():
1165     return (os.environ.get('HTTP_HOST') in config.USE_ZIP_CACHE_ALWAYS_HOSTS)
1166
1167 def _read_cached_zip(server, book, max_age):
1168     #find a recent zip if possible
1169     prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1170     from glob import glob
1171     zips = sorted(glob(prefix + '*.zip'))
1172     if not zips:
1173         log("no cached booki-zips matching %s*.zip" % (prefix,))
1174         return None
1175     zipname = zips[-1]
1176     cutoff = time.time() - max_age * 60
1177     log(repr(zipname))
1178     try:
1179         date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1180         if date > cutoff:
1181             f = open(zipname)
1182             blob = f.read()
1183             f.close()
1184             return blob, zipname
1185         log("%s is too old, must reload" % zipname)
1186         return None
1187     except (IOError, IndexError, ValueError), e:
1188         log('could not make sense of %s: got exception %s' % (zipname, e))
1189         return None
1190
1191
1192 def fetch_zip(server, book, save=False, max_age=-1, filename=None):
1193     interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
1194     try:
1195         url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
1196                                             'server': server, 'book':book}
1197     except KeyError:
1198         raise NotImplementedError("Can't handle '%s' interface" % interface)
1199
1200     if use_cache() and max_age < 0:
1201         #default to 12 hours cache on objavi.halo.gen.nz
1202         max_age = 12 * 60
1203
1204     if max_age:
1205         log('WARNING: trying to use cached booki-zip',
1206             'If you are debugging booki-zip creation, you will go CRAZY'
1207             ' unless you switch this off')
1208         blob_and_name = _read_cached_zip(server, book, max_age)
1209         if blob_and_name is not None:
1210             return blob_and_name
1211
1212     log('fetching zip from %s'% url)
1213     f = urlopen(url)
1214     blob = f.read()
1215     f.close()
1216     if save:
1217         if filename is None:
1218             filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
1219                                   make_book_name(book, server, '.zip'))
1220         f = open(filename, 'w')
1221         f.write(blob)
1222         f.close()
1223     return blob, filename
1224
1225
1226 def split_html(html, compressed_size=None, fix_markup=False):
1227     """Split long html files into pieces that will work nicely on a
1228     Sony Reader."""
1229     if compressed_size is None:
1230         import zlib
1231         compressed_size = len(zlib.compress(html))
1232
1233     splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
1234                  len(html) // config.EPUB_FILE_SIZE_MAX)
1235     log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))
1236
1237     if not splits:
1238         return [html]
1239
1240     if fix_markup:
1241         #remove '<' in attributes etc, which makes the marker
1242         #insertion more reliable
1243         html = etree.tostring(lxml.html.fromstring(html),
1244                               encoding='UTF-8',
1245                               #method='html'
1246                               )
1247
1248     target = len(html) // (splits + 1)
1249     s = 0
1250     fragments = []
1251     for i in range(splits):
1252         e = html.find('<', target * (i + 1))
1253         fragments.append(html[s:e])
1254         fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
1255         s = e
1256     fragments.append(html[s:])
1257
1258     #XXX somehow try to avoid split in silly places (e.g, before inline elements)
1259     chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
1260     return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]
1261