try import from stdlib before simplejson: errors will be more informative
[objavi2.git] / objavi / fmbook.py
blobde933a91ca910ca7646cbc74d3e01ddf23d36eab
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 import copy
28 from subprocess import Popen, check_call, PIPE
29 from cStringIO import StringIO
30 from urllib2 import urlopen, HTTPError
31 import zipfile
32 import traceback
33 from string import ascii_letters
34 from pprint import pformat
36 try:
37 import json
38 except ImportError:
39 import simplejson as json
41 import lxml.html
42 from lxml import etree
44 from objavi import config, epub_utils
45 from objavi.book_utils import log, run, make_book_name, guess_lang, guess_text_dir
46 from objavi.book_utils import ObjaviError, log_types
47 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline, parse_extracted_outline
48 from objavi.epub import add_guts, _find_tag
49 from objavi.xhtml_utils import EpubChapter, split_tree
50 from objavi.cgi_utils import url2path, path2url
52 from iarchive import epub as ia_epub
53 from booki.bookizip import get_metadata, add_metadata
# Absolute path of the scratch directory used for per-book workdirs.
TMPDIR = os.path.abspath(config.TMPDIR)
# Web document root: taken from the CGI environment when present,
# falling back to the configured htdocs directory.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Host header of the current request ('' when not running under CGI).
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the archive.org S3 upload URL and the public details
    page URL for the book with the given id and file name."""
    upload_url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
    details_url = 'http://archive.org/details/booki-%s' % (bookid,)
    return (upload_url, details_url)
64 def _get_best_title(tocpoint):
65 if 'html_title' in tocpoint:
66 return tocpoint['html_title']
67 if 'title' in tocpoint:
68 return tocpoint['title']
69 return 'Untitled'
72 def _add_initial_number(e, n):
73 """Put a styled chapter number n at the beginning of element e."""
74 initial = e.makeelement("strong", Class="initial")
75 e.insert(0, initial)
76 initial.tail = ' '
77 if e.text is not None:
78 initial.tail += e.text
79 e.text = ''
80 initial.text = "%s." % n
def expand_toc(toc, depth=1, index=0):
    """Reformat toc slightly for convenience.

    Annotates every point (recursively) with its depth, a serial
    index, and its url split into filename and fragment (fragment is
    None when the url has no '#').  Returns the next free index.
    """
    for point in toc:
        url = point['url'].lstrip('/')
        parts = url.split('#', 1)
        point['depth'] = depth
        point['filename'] = parts[0]
        point['fragment'] = parts[1] if len(parts) == 2 else None
        point['index'] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
98 def _serialise(rtoc, stoc, depth):
99 for item in rtoc:
100 url = item['url'].lstrip('/')
101 bits = url.split('#', 1)
102 filename = bits[0]
103 fragment = (bits[1] if len(bits) == 2 else None)
104 stoc.append({"depth": depth,
105 "title": item['title'],
106 "url": url,
107 "filename": filename,
108 "fragment": fragment,
109 "type": item['type']
111 if 'children' in item:
112 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Take the recursive TOC structure and turn it into a list of
    serial points (via _serialise), then stamp each point with its
    position in that flat list."""
    stoc = []
    _serialise(rtoc, stoc, 1)
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Map each filename to the list of TOC points that live in it,
    in depth-first pre-order.  Points must already carry a 'filename'
    key (see expand_toc)."""
    tocmap = {}
    #log(rtoc)
    def walk(points):
        for point in points:
            #log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            walk(point.get('children', ()))
    walk(rtoc)
    return tocmap
def save_data(fn, data):
    """Write data to the file fn, first encoding unicode text as
    UTF-8 (silently dropping unencodable characters) so the write
    cannot trip up on non-ascii content."""
    if isinstance(data, unicode):
        data = data.encode('utf8', 'ignore')
    with open(fn, 'w') as f:
        f.write(data)
class Book(object):
    # Page numbering styles handed to number_pdf when stamping page
    # numbers onto the generated PDFs: arabic numerals for the body,
    # roman numerals for the preamble (front matter).
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'
149 def notify_watcher(self, message=None):
150 if self.watchers:
151 if message is None:
152 #message is the name of the caller
153 message = traceback.extract_stack(None, 2)[0][2]
154 log("notify_watcher called with '%s'" % message)
155 for w in self.watchers:
156 w(message)
    def __enter__(self):
        """Context manager entry: return the Book itself."""
        return self
    def __exit__(self, exc_type, exc_value, tb):
        """Context manager exit: tell watchers processing is finished
        and remove the temporary workdir.  Exceptions are not
        suppressed (nothing truthy is returned)."""
        self.notify_watcher(config.FINISHED_MESSAGE)
        self.cleanup()
        #could deal with exceptions here and return true
167 def __init__(self, book, server, bookname,
168 page_settings=None, watchers=None, isbn=None,
169 license=config.DEFAULT_LICENSE, title=None,
170 max_age=0):
171 log("*** Starting new book %s ***" % bookname)
172 self.watchers = set()
173 if watchers is not None:
174 self.watchers.update(watchers)
175 self.notify_watcher('start')
176 self.bookname = bookname
177 self.book = book
178 self.server = server
179 self.cookie = ''.join(random.sample(ascii_letters, 10))
180 try:
181 blob, self.bookizip_file = fetch_zip(server, book, save=True, max_age=max_age)
182 except HTTPError, e:
183 traceback.print_exc()
184 self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
185 #not much to do?
186 #raise 502 Bad Gateway ?
187 sys.exit()
188 f = StringIO(blob)
189 self.notify_watcher('fetch_zip')
190 self.store = zipfile.ZipFile(f, 'r')
191 self.info = json.loads(self.store.read('info.json'))
192 for k in ('manifest', 'metadata', 'spine', 'TOC'):
193 if k not in self.info:
194 raise ObjaviError('info.json of %s lacks vital element "%s"' %
195 (bookname, k))
196 #check types also?
198 self.metadata = self.info['metadata']
199 self.spine = self.info['spine']
200 self.manifest = self.info['manifest']
202 if server == config.LOCALHOST: # [DEPRECATED]
203 server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
204 book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
206 log(pformat(self.metadata))
207 self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
208 if not self.lang:
209 self.lang = guess_lang(server, book)
210 log('guessed lang as %s' % self.lang)
212 self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
213 if not self.toc_header:
214 self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
216 self.dir = str(get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0])
217 if not self.dir:
218 self.dir = guess_text_dir(server, book)
220 #Patch in the extra metadata. (lang and dir may be set from config)
221 #these should be read from zip -- so should go into zip?
222 for var, key, scheme, ns in (
223 (isbn, 'id', 'ISBN', config.DC),
224 (license, 'rights', 'License', config.DC),
225 (title, 'title', '', config.DC),
226 (self.lang, 'language', '', config.DC),
227 (self.dir, 'dir', '', config.FM),
229 if var is not None:
230 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
232 self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
233 self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
235 self.toc = self.info['TOC']
236 expand_toc(self.toc)
238 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
239 os.chmod(self.workdir, 0755)
241 self.body_html_file = self.filepath('body.html')
242 self.body_pdf_file = self.filepath('body.pdf')
243 self.preamble_html_file = self.filepath('preamble.html')
244 self.preamble_pdf_file = self.filepath('preamble.pdf')
245 self.tail_html_file = self.filepath('tail.html')
246 self.tail_pdf_file = self.filepath('tail.pdf')
247 self.isbn_pdf_file = None
248 self.pdf_file = self.filepath('final.pdf')
249 self.body_odt_file = self.filepath('body.odt')
250 self.outline_file = self.filepath('outline.txt')
252 self.publish_file = os.path.abspath(os.path.join(config.PUBLISH_DIR, bookname))
254 if page_settings is not None:
255 self.maker = PageSettings(**page_settings)
257 if title is not None:
258 self.title = title
259 else:
260 titles = get_metadata(self.metadata, 'title')
261 if titles:
262 self.title = titles[0]
263 else:
264 self.title = 'A Book About ' + self.book
265 if isinstance(self.title, unicode):
266 self.title = self.title.encode('utf-8')
268 self.notify_watcher()
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            """Best-effort removal of the workdir when the Book is
            garbage collected (only defined when config enables it)."""
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
279 def get_tree_by_id(self, id):
280 """get an HTML tree from the given manifest ID"""
281 name = self.manifest[id]['url']
282 mimetype = self.manifest[id]['mimetype']
283 s = self.store.read(name)
284 f = StringIO(s)
285 if mimetype == 'text/html':
286 try:
287 tree = lxml.html.parse(f)
288 except etree.XMLSyntaxError, e:
289 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
290 (id, name, s[:20], e))
291 tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
292 elif 'xml' in mimetype: #XXX or is this just asking for trouble?
293 tree = etree.parse(f)
294 else:
295 tree = f.read()
296 f.close()
297 return tree
299 def filepath(self, fn):
300 return os.path.join(self.workdir, fn)
302 def save_tempfile(self, fn, data):
303 """Save the data in a temporary directory that will be cleaned
304 up when all is done. Return the absolute file path."""
305 fn = self.filepath(fn)
306 save_data(fn, data)
307 return fn
    def make_oo_doc(self):
        """Make an openoffice document, using the html2odt script,
        and move it to the publish location."""
        #html2odt drives openoffice, which needs a running X server
        self.wait_for_xvfb()
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)
        run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
        log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
        os.rename(self.body_odt_file, self.publish_file)
        self.notify_watcher()
    def extract_pdf_outline(self):
        """Get the outline (table of contents) for the PDF, which
        wkhtmltopdf should have written to a file. If that file
        doesn't exist (or config says not to use it), fall back to
        using self._extract_pdf_outline_the_old_way, below.

        Sets self.outline_contents and returns the page count.
        """
        if config.USE_DUMP_OUTLINE:
            try:
                self.outline_contents, number_of_pages = \
                    parse_extracted_outline(self.outline_file)

            except Exception, e:
                #any failure (missing/corrupt outline file) falls back
                traceback.print_exc()
                number_of_pages = self._extract_pdf_outline_the_old_way()
        else:
            number_of_pages = self._extract_pdf_outline_the_old_way()

        self.notify_watcher()
        return number_of_pages
    def _extract_pdf_outline_the_old_way(self):
        """Try to get the PDF outline using pdftk. This doesn't work
        well with all scripts.

        Returns the number of pages; sets self.outline_contents to a
        list of (title, depth, pageno) tuples.
        """
        debugf = self.filepath('extracted-outline.txt')
        self.outline_contents, number_of_pages = \
            parse_outline(self.body_pdf_file, 1, debugf)

        if not self.outline_contents:
            #probably problems with international text. need a horrible hack
            log('no outline: trying again with ascii headings')
            import copy
            tree = copy.deepcopy(self.tree)
            titlemap = {}
            #replace every heading's content with an ascii key like 'h2_7',
            #remembering the real title so it can be restored afterwards
            for tag in ('h1', 'h2', 'h3', 'h4'):
                for i, e in enumerate(tree.getiterator(tag)):
                    key = "%s_%s" % (tag, i)
                    titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                    del e[:]
                    if tag == 'h1':
                        #h1 keys go inside a styled <strong>, matching
                        #the chapter-number markup used elsewhere
                        e = lxml.etree.SubElement(e, "strong", Class="initial")
                    e.text = key
                    log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))

            #render a throwaway PDF from the ascii-keyed html and
            #extract its outline instead
            ascii_html_file = self.filepath('body-ascii-headings.html')
            ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
            html_text = lxml.etree.tostring(tree, method="html")
            save_data(ascii_html_file, html_text)
            self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
            debugf = self.filepath('ascii-extracted-outline.txt')
            ascii_contents, number_of_ascii_pages = \
                parse_outline(ascii_pdf_file, 1, debugf)
            self.outline_contents = []
            log ("number of pages: %s, post ascii: %s" %
                 (number_of_pages, number_of_ascii_pages))
            for ascii_title, depth, pageno in ascii_contents:
                if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
                    ascii_title = ascii_title[:-4]
                if ' ' in ascii_title:
                    #keep only the key part after any leading text
                    ascii_title = ascii_title.rsplit(' ', 1)[1]
                title = titlemap.get(ascii_title, '')
                log((ascii_title, title, depth, pageno))

                self.outline_contents.append((title, depth, pageno))

        return number_of_pages
    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit.  Saves the html,
        renders it, extracts the outline (for page count and ToC),
        reshapes the pages and stamps on page numbers."""
        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')

        #3. get the outline -- also yields the page count
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()
    def make_preamble_pdf(self):
        """Build the front matter PDF: the inside cover page followed
        by the table of contents, numbered with the preamble style
        (roman numerals)."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        log_types(self.dir, self.css_url, self.title, inside_cover_html,
                  self.toc_header, contents, self.title)

        #the title is repeated in an invisible trailing div (inside an
        #html comment) -- presumably to pad/force the final page break
        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents"><h1>%s</h1>\n%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     self.toc_header, contents, self.title)
        save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        #number_start=-2: presumably leaves the first preamble pages
        #unnumbered -- confirm against PageSettings.number_pdf
        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()
    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover. If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        end_matter = self.compose_end_matter()
        #log(end_matter)
        save_data(self.tail_html_file, end_matter.decode('utf-8'))
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

        #even_pages=False: the tail is allowed to end on an odd page
        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()
454 def make_book_pdf(self):
455 """A convenient wrapper of a few necessary steps"""
456 # now the Xvfb server is needed. make sure it has had long enough to get going
457 self.wait_for_xvfb()
458 self.make_body_pdf()
459 self.make_preamble_pdf()
460 self.make_end_matter_pdf()
462 concat_pdfs(self.pdf_file, self.preamble_pdf_file,
463 self.body_pdf_file, self.tail_pdf_file,
464 self.isbn_pdf_file)
466 self.notify_watcher('concatenated_pdfs')
    def make_templated_html(self, template=None, zip=False, index=config.TEMPLATING_INDEX_FIRST):
        """Make a templated html version of the book.

        template: html template string; when empty/None the configured
                  default template file is used.
        zip:      NOTE(review): unused in this method body -- confirm
                  whether callers rely on it.
        index:    key into config.TEMPLATING_INDEX_MODES choosing which
                  file is index.html vs contents.html.
        """
        #set up the directory and static files
        self.unpack_static()
        destdir = self.filepath('html')
        os.mkdir(destdir)
        #destdir is absolute, so os.path.join inside filepath yields destdir/static
        os.rename(self.filepath('static'), self.filepath(os.path.join(destdir, 'static')))

        if not template:
            template_tree = lxml.html.parse(config.TEMPLATING_DEFAULT_TEMPLATE).getroot()
        else:
            template_tree = lxml.html.document_fromstring(template)

        tocmap = filename_toc_map(self.toc)
        contents_name, first_name = config.TEMPLATING_INDEX_MODES[index]

        #build a contents page and a contents menu
        #We can't make this in the same pass because the menu needs to
        #go in every page (i.e., into the template)
        menu = etree.Element('ul', Class=config.TEMPLATING_MENU_ELEMENT)
        contents = etree.Element('div', Class=config.TEMPLATING_REPLACED_ELEMENT)
        etree.SubElement(contents, 'h1').text = self.title

        #savename tracks the filename the NEXT chapter link should use;
        #it starts as first_name so the first chapter doubles as index
        savename = first_name
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            #handle any TOC points in this file.
            for point in tocmap[filename]:
                if point['type'] == 'booki-section':
                    etree.SubElement(contents, 'h2').text = point['title']
                    etree.SubElement(menu, 'li', Class='booki-section').text = point['title']
                else:
                    if savename is None:
                        savename = filename
                    div = etree.SubElement(contents, 'div')
                    etree.SubElement(div, 'a', href=savename).text = point['title']
                    li = etree.SubElement(menu, 'li')
                    li.tail = '\n'
                    etree.SubElement(li, 'a', href=savename).text = point['title']
                    savename = None
        #put the menu into the template (if it wants it)
        for e in template_tree.iterdescendants(config.TEMPLATING_MENU_ELEMENT):
            e.getparent().replace(e, menu)

        #function to template content and write to disk
        def save_content(content, title, filename):
            content.set('id', config.TEMPLATING_CONTENTS_ID)
            dest = copy.deepcopy(template_tree)
            for e in dest.iterdescendants(config.TEMPLATING_REPLACED_ELEMENT):
                e.getparent().replace(e, content)
            for e in dest.iterdescendants('title'):
                e.text = title
            self.save_tempfile(os.path.join(destdir, filename), lxml.html.tostring(dest))

        #write the contents to a file. (either index.html or contents.html)
        save_content(contents, self.title, contents_name)

        savename = first_name
        #and now write each chapter to a file
        for ID in self.spine:
            filename = self.manifest[ID]['url']
            try:
                root = self.get_tree_by_id(ID).getroot()
                body = root.find('body')
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot().find('body')" % (e, ID))
                body = etree.Element('body')

            #handle any TOC points in this file. There should only be one!
            for point in tocmap[filename]:
                if point['type'] != 'booki-section':
                    title = point['title']
                    break
            else:
                title = self.title

            if savename is None:
                savename = filename
            save_content(body, title, savename)
            savename = None
        log(destdir, self.publish_file)
        os.rename(destdir, self.publish_file)
        self.notify_watcher()
    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page. This is used for multicolumn newspapers and for
        web-destined pdfs.

        mode: 'web' skips gutters, reshaping and page numbers.
        """
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            #no gutter needed when nobody will bind the pages
            self.maker.gutter = 0

        #1. Save the html
        html_text = etree.tostring(self.tree, method="html")
        save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True, outline_file=self.outline_file)
        self.notify_watcher('generate_pdf')
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()
592 def rotate180(self):
593 """Rotate the pdf 180 degrees so an RTL book can print on LTR
594 presses."""
595 rotated = self.filepath('final-rotate.pdf')
596 unrotated = self.filepath('final-pre-rotate.pdf')
597 #leave the unrotated pdf intact at first, in case of error.
598 rotate_pdf(self.pdf_file, rotated)
599 os.rename(self.pdf_file, unrotated)
600 os.rename(rotated, self.pdf_file)
601 self.notify_watcher()
    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()
    def publish_bookizip(self):
        """Publish the bookizip. For this, copy rather than move,
        because the bookizip might be used by further processing. If
        possible, a hard link is created."""
        log("Publishing %r as %r" % (self.bookizip_file, self.publish_file))
        try:
            #try a hard link first; NOTE(review): whether a failed
            #'cp -l' surfaces here as OSError depends on the project's
            #run() helper -- confirm.
            run(['cp', '-l', self.bookizip_file, self.publish_file])
        except OSError:
            run(['cp', self.bookizip_file, self.publish_file])
        self.notify_watcher()
    def concat_html(self):
        """Join all the chapters together into one tree. Keep the TOC
        up-to-date along the way.

        Returns the combined lxml.html document; each TOC point gains
        an 'html_id' (and possibly 'html_title') along the way.
        """

        #each manifest item looks like:
        #{'contributors': []
        #'license': [],
        #'mimetype': '',
        #'rightsholders': []
        #'url': ''}
        doc = lxml.html.document_fromstring('<html><body></body></html>')
        tocmap = filename_toc_map(self.toc)
        for ID in self.spine:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            # ACO changed this (comment translated from Croatian: "ACO MIJENJAO")
            try:
                root = self.get_tree_by_id(ID).getroot()
            except Exception, e:
                log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e, ID))
                continue
            #handle any TOC points in this file
            for point in tocmap[details['url']]:
                #if the url has a #identifier, use it. Otherwise, make
                #one up, using a hidden element at the beginning of
                #the inserted document.
                #XXX this will break if different files use the same ids
                #XXX should either replace all, or replace selectively.
                if point['fragment']:
                    fragment = point['fragment']
                else:
                    body = _find_tag(root, 'body')
                    #self.cookie makes the generated id unique per run
                    fragment = '%s_%s' % (self.cookie, point['index'])
                    #reuse first tag if it is suitable.
                    if (len(body) and
                        body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                        if body[0].get('id') is None:
                            body[0].set('id', fragment)
                        else:
                            fragment = body[0].get('id')
                        #the chapter starts with a heading. that heading should be the chapter name.
                        if body[0].tag in ('h1', 'h2', 'h3'):
                            #log('chapter has title "%s", found html title "%s"' %
                            #    (point['title'], body[0].text_content()))
                            point['html_title'] = body[0].text_content()
                    else:
                        #no suitable first tag: insert a hidden marker div
                        marker = body.makeelement('div', style="display:none",
                                                  id=fragment)
                        body.insert(0, marker)
                point['html_id'] = fragment

            add_guts(root, doc)
        return doc
674 def unpack_static(self):
675 """Extract static files from the zip for the html to refer to."""
676 static_files = [x['url'] for x in self.manifest.values()
677 if x['url'].startswith('static')]
678 if static_files:
679 os.mkdir(self.filepath('static'))
681 for name in static_files:
682 s = self.store.read(name)
683 f = open(self.filepath(name), 'w')
684 f.write(s)
685 f.close()
686 self.notify_watcher()
688 def load_book(self):
689 """"""
690 #XXX concatenate the HTML to match how TWiki version worked.
691 # This is perhaps foolishly early -- throwing away useful boundaries.
692 self.unpack_static()
693 self.tree = self.concat_html()
694 self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))
696 self.headings = [x for x in self.tree.cssselect('h1')]
697 if self.headings:
698 self.headings[0].set('class', "first-heading")
699 for h1 in self.headings:
700 h1.title = h1.text_content().strip()
701 self.notify_watcher()
    def make_contents(self):
        """Generate HTML containing the table of contents. This can
        only be done after the main PDF has been made, because the
        page numbers are contained in the PDF outline.

        Returns a utf-8 encoded html table string.
        """
        header = '<table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []

        chapter = 1
        page_num = 1
        #log(self.outline_contents)
        outline_contents = iter(self.outline_contents)

        for section in self.toc:
            if not section.get('children'):
                contents.append(empty_section_tmpl % section['title'])
                continue
            contents.append(section_tmpl % section['title'])

            for point in section['children']:
                try:
                    #skip outline entries until the next top-level
                    #(level 1) heading, whose page number we want
                    level = 99
                    while level > 1:
                        h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    log("contents data not found for %s. Stopping" % (point,))
                    break
                contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
                chapter += 1

        doc = header + '\n'.join(contents) + footer
        if isinstance(doc, unicode):
            doc = doc.encode('utf-8')
        self.notify_watcher()
        return doc
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies. These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        chapter = 1
        section = None
        #log(self.toc)
        for t in self.toc:
            #only top level sections get a subsection page,
            #and only if they have children.
            if t.get('children'):
                section = self.tree.makeelement('div', Class="objavi-subsection")
                heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
                heading.text = t['title']
                for child in t['children']:
                    item = etree.SubElement(section, 'div', Class="objavi-chapter")
                    if 'html_title' in child:
                        #chapter already has an in-document heading:
                        #number that heading in place
                        item.text = child['html_title']
                        heading = self.tree.cssselect('#'+ child['html_id'])
                        if heading:
                            _add_initial_number(heading[0], chapter)
                    else:
                        item.text = child['title']
                        _add_initial_number(item, chapter)
                    log(item.text, debug='HTMLGEN')
                    chapter += 1
                log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
                #insert the section page just before the section's anchor
                location = self.tree.cssselect('#'+ t['html_id'])[0]
                location.addprevious(section)

        self.notify_watcher()
    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to.

        When css is empty, a default is chosen from the server config,
        falling back to a per-language default.  Sets self.css_url and
        returns the chosen url.
        """
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            css_default = config.SERVER_DEFAULTS[self.server]['css-%s' % mode]
            if css_default is None:
                #guess from language -- this should come first
                css_modes = config.LANGUAGE_CSS.get(self.lang,
                                                    config.LANGUAGE_CSS['en'])
                css_default = css_modes.get(mode, css_modes[None])
            url = css_default
        elif not re.match(r'^http://\S+$', css):
            #not a url: treat as literal css, save it and link to the file
            url = path2url(self.save_tempfile('objavi.css', css), full=True)
        else:
            url = css

        #find the head -- it's probably first child but lets not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url
    def _read_localised_template(self, template, fallbacks=['en']):
        """Try to get the template in the approriate language, otherwise in english.

        template is a filename pattern with one %s slot for the
        language code.  NOTE(review): if every open() fails, 'f' is
        unbound and the read below raises NameError; also note the
        mutable default argument -- confirm both are acceptable.
        """
        for lang in [self.lang] + fallbacks:
            try:
                fn = template % (lang)
                f = open(fn)
                break
            except IOError, e:
                log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
                log(e)
        template = f.read()
        f.close()
        return template
    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           #NOTE(review): the closing brace of this dict
                           #(and possibly further entries) appears to have
                           #been lost in this copy -- restore before use.
    def compose_end_matter(self):
        """create the markup for the end_matter inside cover. If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages.
        """
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title
             #NOTE(review): the closing brace of this dict appears to
             #have been lost in this copy -- restore before use.

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            #force the spill onto a second page when there is no barcode page
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d
    def make_epub(self, use_cache=False):
        """Make an epub version of the book, using Mike McCabe's
        epub module for the Internet Archive.

        HTML chapters are converted to xhtml (splitting oversized ones
        for reader compatibility); other manifest items are copied
        through.  The ncx ToC, spine and opf metadata are then added
        and the epub written to self.publish_file.
        """
        ebook = ia_epub.Book(self.publish_file, content_dir='')
        def add_file(ID, filename, mediatype, content):
            #helper: register one content item with the epub container
            ebook.add_content({'media-type': mediatype.encode('utf-8'),
                               'id': ID.encode('utf-8'),
                               'href': filename.encode('utf-8'),
                               }, content)
        toc = self.info['TOC']

        #manifest
        filemap = {} #map html to corresponding xhtml
        spinemap = {} #map IDs to multi-file chapters
        for ID in self.manifest:
            details = self.manifest[ID]
            #log(ID, pformat(details))
            fn, mediatype = details['url'], details['mimetype']
            content = self.store.read(fn)
            if mediatype == 'text/html':
                #convert to application/xhtml+xml, and perhaps split
                c = EpubChapter(self.server, self.book, ID, content,
                                use_cache=use_cache)
                c.remove_bad_tags()
                if fn[-5:] == '.html':
                    fnbase = fn[:-5]
                else:
                    fnbase = fn
                fnx = fnbase + '.xhtml'
                mediatype = 'application/xhtml+xml'

                #NOTE(review): split_html is not among this file's visible
                #imports -- presumably defined elsewhere in the module; confirm.
                fragments = split_html(c.as_xhtml(),
                                       compressed_size=self.store.getinfo(fn).compress_size)

                #add the first one as if it is the whole thing (as it often is)
                add_file(ID, fnx, mediatype, fragments[0])
                filemap[fn] = fnx
                if len(fragments) > 1:
                    spine_ids = [ID]
                    spinemap[ID] = spine_ids
                    #add any extras
                    for i in range(1, len(fragments)):
                        # XXX it is possible for duplicates if another
                        # file happens to have this name. Ignore for now
                        _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                        spine_ids.append(_id)
                        add_file(_id,
                                 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                                 mediatype, fragments[i])

            else:
                add_file(ID, fn, mediatype, content)

        #toc
        ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
        ebook.add(ebook.content_dir + 'toc.ncx', ncx)

        #spine
        for ID in self.spine:
            if ID in spinemap:
                for x in spinemap[ID]:
                    ebook.add_spine_item({'idref': x})
            else:
                ebook.add_spine_item({'idref': ID})

        #metadata -- no use of attributes (yet)
        # and fm: metadata disappears for now
        DCNS = config.DCNS
        DC = config.DC
        meta_info_items = []
        for ns, namespace in self.metadata.items():
            for keyword, schemes in namespace.items():
                if ns:
                    keyword = '{%s}%s' % (ns, keyword)
                for scheme, values in schemes.items():
                    for value in values:
                        item = {
                            'item': keyword,
                            'text': value,
                        #NOTE(review): the closing brace of this dict and
                        #the meta_info_items.append(item) call appear to
                        #have been lost in this copy -- restore before use.
                        if scheme:
                            if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                                item['atts'] = {'role': scheme}
                            else:
                                item['atts'] = {'scheme': scheme}

        has_authors = 'creator' in self.metadata[DC]
        if not has_authors and config.CLAIM_UNAUTHORED:
            #NOTE(review): when has_authors is False this lookup of
            #self.metadata[DC]['creator'] looks like a KeyError -- confirm.
            authors = []
            for x in self.metadata[DC]['creator'].values():
                authors.extend(x)

            meta_info_items.append({'item': DCNS + 'creator',
                                    'text': 'The Contributors'})

            meta_info_items.append({'item': DCNS + 'rights',
                                    'text': 'This book is free. Copyright %s' % (', '.join(authors))}
                                   #NOTE(review): closing paren of this append
                                   #appears to have been lost in this copy.

        tree_str = ia_epub.make_opf(meta_info_items,
                                    ebook.manifest_items,
                                    ebook.spine_items,
                                    ebook.guide_items,
                                    ebook.cover_id)
        ebook.add(ebook.content_dir + 'content.opf', tree_str)
        ebook.z.close()
        self.notify_watcher()
972 def publish_s3(self):
973 """Push the book's epub to archive.org, using S3."""
974 #XXX why only epub?
975 secrets = {}
976 for x in ('S3_SECRET', 'S3_ACCESSKEY'):
977 fn = getattr(config, x)
978 f = open(fn)
979 secrets[x] = f.read().strip()
980 f.close()
982 now = time.strftime('%F')
983 s3output = self.filepath('s3-output.txt')
984 s3url, detailsurl = find_archive_urls(self.book, self.bookname)
985 headers = [
986 'x-amz-auto-make-bucket:1',
987 "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
988 'x-archive-meta-mediatype:texts',
989 'x-archive-meta-collection:opensource',
990 'x-archive-meta-title:%s' % (self.book,),
991 'x-archive-meta-date:%s' % (now,),
992 'x-archive-meta-creator:FLOSS Manuals Contributors',
995 if self.license in config.LICENSES:
996 headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
998 argv = ['curl', '--location', '-s', '-o', s3output]
999 for h in headers:
1000 argv.extend(('--header', h))
1001 argv.extend(('--upload-file', self.publish_file, s3url,))
1003 log(' '.join(repr(x) for x in argv))
1004 check_call(argv, stdout=sys.stderr)
1005 self.notify_watcher()
1006 return detailsurl, s3url
1009 def spawn_x(self):
1010 """Start an Xvfb instance, using a new server number. A
1011 reference to it is stored in self.xvfb, which is used to kill
1012 it when the pdf is done.
1014 Note that Xvfb doesn't interact well with dbus which is
1015 present on modern desktops.
1017 #Find an unused server number (in case two cgis are running at once)
1018 while True:
1019 servernum = random.randrange(50, 500)
1020 if not os.path.exists('/tmp/.X%s-lock' % servernum):
1021 break
1023 self.xserver_no = ':%s' % servernum
1025 authfile = self.filepath('Xauthority')
1026 os.environ['XAUTHORITY'] = authfile
1028 #mcookie(1) eats into /dev/random, so avoid that
1029 from hashlib import md5
1030 m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
1031 mcookie = m.hexdigest()
1033 check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
1035 self.xvfb = Popen(['Xvfb', self.xserver_no,
1036 '-screen', '0', '1024x768x24',
1037 '-pixdepths', '32',
1038 #'-blackpixel', '0',
1039 #'-whitepixel', str(2 ** 24 -1),
1040 #'+extension', 'Composite',
1041 '-dpi', '96',
1042 #'-kb',
1043 '-nolisten', 'tcp',
1046 # We need to wait a bit before the Xvfb is ready. but the
1047 # downloads are so slow that that probably doesn't matter
1049 self.xvfb_ready_time = time.time() + 2
1051 os.environ['DISPLAY'] = self.xserver_no
1052 log(self.xserver_no)
1054 def wait_for_xvfb(self):
1055 """wait until a previously set time before continuing. This
1056 is so Xvfb has time to properly start."""
1057 if hasattr(self, 'xvfb'):
1058 d = self.xvfb_ready_time - time.time()
1059 if d > 0:
1060 time.sleep(d)
1061 self.notify_watcher()
1063 def cleanup_x(self):
1064 """Try very hard to kill off Xvfb. In addition to killing
1065 this instance's xvfb, occasionally (randomly) search for
1066 escaped Xvfb instances and kill those too."""
1067 if not hasattr(self, 'xvfb'):
1068 return
1069 check_call(['xauth', 'remove', self.xserver_no])
1070 p = self.xvfb
1071 log("trying to kill Xvfb %s" % p.pid)
1072 os.kill(p.pid, 15)
1073 for i in range(10):
1074 if p.poll() is not None:
1075 log("%s died with %s" % (p.pid, p.poll()))
1076 break
1077 log("%s not dead yet" % p.pid)
1078 time.sleep(0.2)
1079 else:
1080 log("Xvfb would not die! kill -9! kill -9!")
1081 try:
1082 os.kill(p.pid, 9)
1083 except OSError, e:
1084 log(e)
1086 if random.random() < 0.1:
1087 # occasionally kill old xvfbs and soffices, if there are any.
1088 self.kill_old_processes()
1090 def kill_old_processes(self):
1091 """Sometimes, despite everything, Xvfb or soffice instances
1092 hang around well after they are wanted -- for example if the
1093 cgi process dies particularly badly. So kill them if they have
1094 been running for a long time."""
1095 log("running kill_old_processes")
1096 killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
1097 os.path.basename(config.HTML2ODT),
1098 os.path.basename(config.WKHTMLTOPDF),
1100 p = Popen(['ps', '-C', killable_names,
1101 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
1102 data = p.communicate()[0].strip()
1103 if data:
1104 lines = data.split('\n')
1105 pids = []
1106 for line in lines:
1107 log('dealing with ps output "%s"' % line)
1108 try:
1109 pid, days, hours, minutes, seconds \
1110 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
1111 except AttributeError:
1112 log("Couldn't parse that line!")
1113 # 50 minutes should be enough xvfb time for anyone
1114 if days or hours or int(minutes) > 50:
1115 pid = int(pid)
1116 log("going to kill pid %s" % pid)
1117 os.kill(pid, 15)
1118 pids.append(pid)
1120 time.sleep(1.0)
1121 for pid in pids:
1122 #try again in case any are lingerers
1123 try:
1124 os.kill(int(pid), 9)
1125 except OSError, e:
1126 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
1127 continue
1128 log('killing %s with -9' % pid)
1129 self.notify_watcher()
1131 def cleanup(self):
1132 self.cleanup_x()
1133 if not config.KEEP_TEMP_FILES:
1134 for fn in os.listdir(self.workdir):
1135 os.remove(os.path.join(self.workdir, fn))
1136 os.rmdir(self.workdir)
1137 else:
1138 log("NOT removing '%s', containing the following files:" % self.workdir)
1139 log(*os.listdir(self.workdir))
1141 self.notify_watcher()
def use_cache():
    """True when this host is configured to always use cached booki-zips."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
def _read_cached_zip(server, book, max_age):
    """Return (blob, filename) for the most recent cached booki-zip
    for this book, or None when there is no cache, the newest zip is
    older than max_age minutes, or its name cannot be parsed."""
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        # the timestamp is encoded in the filename
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            # binary mode inside a with-block: the blob is zip data,
            # not text, and the handle must close even on error
            with open(zipname, 'rb') as f:
                blob = f.read()
            return blob, zipname
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError) as e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Fetch the booki-zip for a book, optionally consulting and
    populating the local cache.

    Returns a (blob, filename) tuple; filename is None unless the
    blob came from, or was saved to, a local file.

    Raises NotImplementedError when the server's interface has no
    configured zip URL.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        # NOTE(review): HTTP_HOST appears to be a module-level
        # constant defined earlier in this file -- confirm.
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book': book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        # close the connection even if the read fails
        f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # 'wb', not 'w': the blob is binary zip data
        with open(filename, 'wb') as f:
            f.write(blob)
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html files into pieces that will work nicely on a
    Sony Reader.

    Returns a list of html strings; if no split is needed the list
    holds the original html unchanged.
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        # place each marker at the first tag boundary after the target
        e = html.find('<', target * (i + 1))
        if e == -1:
            # no tag after the target point: without this guard the
            # slice below would be html[s:-1], silently dropping the
            # tail of the document
            e = len(html)
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]