objavi/fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This contains classes representing books and coordinates their processing.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Library module representing a complete FM book being turned into a
  21 PDF"""
  22
  23 import os, sys
  24 import tempfile
  25 import re, time
  26 import random
  27 from subprocess import Popen, check_call, PIPE
  28 from cStringIO import StringIO
  29 from urllib2 import urlopen, HTTPError
  30 import zipfile
  31 import traceback
  32 from string import ascii_letters
  33 from pprint import pformat
  34
  35 try:
  36     import simplejson as json
  37 except ImportError:
  38     import json
  39
  40 import lxml, lxml.html
  41 from lxml import etree
  42
  43 from objavi import config, epub_utils
  44 from objavi.cgi_utils import log, run, shift_file, make_book_name, guess_lang, guess_text_dir
  45 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
  46 from objavi.epub import add_guts, _find_tag
  47
  48 from iarchive import epub as ia_epub
  49 from booki.xhtml_utils import EpubChapter
  50 from booki.bookizip import get_metadata, add_metadata, clear_metadata, get_metadata_schemes
  51
  52 TMPDIR = os.path.abspath(config.TMPDIR)
  53 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  54 HTTP_HOST = os.environ.get('HTTP_HOST', '')
  55 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  56
  57
  58 def _get_best_title(tocpoint):
  59     if 'html_title' in tocpoint:
  60         return tocpoint['html_title']
  61     if 'title' in tocpoint:
  62         return tocpoint['title']
  63     return 'Untitled'
  64
  65
  66 def _add_initial_number(e, n):
  67     """Put a styled chapter number n at the beginning of element e."""
  68     initial = e.makeelement("strong", Class="initial")
  69     e.insert(0, initial)
  70     initial.tail = ' '
  71     if e.text is not None:
  72         initial.tail += e.text
  73     e.text = ''
  74     initial.text = "%s." % n
  75
  76 def expand_toc(toc, depth=1, index=0):
  77     """Reformat toc slightly for convenience"""
  78     for item in toc:
  79         url = item['url'].lstrip('/')
  80         bits = url.split('#', 1)
  81         filename = bits[0]
  82         fragment = (bits[1] if len(bits) == 2 else None)
  83         item['depth'] = depth
  84         item["filename"] = filename
  85         item["fragment"] = fragment
  86         item["index"] = index
  87         index += 1
  88         if 'children' in item:
  89             index = expand_toc(item['children'], depth + 1, index)
  90     return index
  91
  92 def _serialise(rtoc, stoc, depth):
  93     for item in rtoc:
  94         url = item['url'].lstrip('/')
  95         bits = url.split('#', 1)
  96         filename = bits[0]
  97         fragment = (bits[1] if len(bits) == 2 else None)
  98         stoc.append({"depth": depth,
  99                      "title": item['title'],
 100                      "url": url,
 101                      "filename": filename,
 102                      "fragment": fragment,
 103                      "type": item['type']
 104                      })
 105         if 'children' in item:
 106             _serialise(item['children'], stoc, depth + 1)
 107
 108
 109 def serialise_toc(rtoc):
 110     """Take the recursive TOC structure and turn it into a list of
 111     serial points.  Reformat some things for convenience."""
 112     stoc = []
 113     _serialise(rtoc, stoc, 1)
 114     for i, x in enumerate(stoc):
 115         x['position'] = i
 116     return stoc
 117
 118 def filename_toc_map(rtoc):
 119     tocmap = {}
 120     log(rtoc)
 121     def traverse(toc):
 122         for point in toc:
 123             log(point.keys())
 124             tocmap.setdefault(point['filename'], []).append(point)
 125             if 'children' in point:
 126                 traverse(point['children'])
 127     traverse(rtoc)
 128     return tocmap
 129
 130
 131 class Book(object):
 132     page_numbers = 'latin'
 133     preamble_page_numbers = 'roman'
 134
 135     def notify_watcher(self, message=None):
 136         if self.watcher:
 137             if  message is None:
 138                 #message is the name of the caller
 139                 message = traceback.extract_stack(None, 2)[0][2]
 140             log("notify_watcher called with '%s'" % message)
 141             self.watcher(message)
 142
 143     def __enter__(self):
 144         return self
 145
 146     def __exit__(self, exc_type, exc_value, traceback):
 147         self.notify_watcher('finished')
 148         self.cleanup()
 149         #could deal with exceptions here and return true
 150
 151
 152     def __init__(self, book, server, bookname, project=None,
 153                  page_settings=None, watcher=None, isbn=None,
 154                  license=config.DEFAULT_LICENSE, title=None,
 155                  max_age=0):
 156         log("*** Starting new book %s ***" % bookname,
 157             "starting zipbook with", server, book, project)
 158         self.watcher = watcher
 159         self.notify_watcher('start')
 160         self.bookname = bookname
 161         self.book = book
 162         self.server = server
 163         self.project = project
 164         self.cookie = ''.join(random.sample(ascii_letters, 10))
 165         try:
 166             blob = fetch_zip(server, book, project, save=True, max_age=max_age)
 167         except HTTPError, e:
 168             #log(e.url)
 169             traceback.print_exc()
 170             self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
 171             #not much to do?
 172             sys.exit()
 173         f = StringIO(blob)
 174         self.notify_watcher('fetch_zip')
 175         self.store = zipfile.ZipFile(f, 'r')
 176         self.info = json.loads(self.store.read('info.json'))
 177         for k in ('manifest', 'metadata', 'spine', 'TOC'):
 178             if k not in self.info:
 179                 raise ObjaviError('info.json of %s lacks vital element "%s"' %
 180                                   (bookname, k))
 181             #check types also?
 182
 183         self.metadata = self.info['metadata']
 184         self.spine = self.info['spine']
 185         self.manifest = self.info['manifest']
 186
 187         if server == config.LOCALHOST: # [DEPRECATED]
 188             server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
 189             book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
 190
 191         log(pformat(self.metadata))
 192         self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
 193         if not self.lang:
 194             self.lang = guess_lang(server, book)
 195             log('guessed lang as %s' % self.lang)
 196
 197         self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
 198         if not self.toc_header:
 199             self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
 200
 201         self.dir = get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0]
 202         if not self.dir:
 203             self.dir = guess_text_dir(server, book)
 204
 205
 206         #Patch in the extra metadata. (lang and dir may be set from config)
 207         #these should be read from zip -- so should go into zip?
 208         for var, key, scheme, ns in (
 209             (isbn, 'id', 'ISBN', config.DC),
 210             (license, 'rights', 'License', config.DC),
 211             (title, 'title', '', config.DC),
 212             (self.lang, 'language', '', config.DC),
 213             (self.dir, 'dir', '', config.FM),
 214             ):
 215             if var is not None:
 216                 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
 217
 218         self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
 219         self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
 220
 221         self.toc = self.info['TOC']
 222         expand_toc(self.toc)
 223
 224         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 225         os.chmod(self.workdir, 0755)
 226
 227         self.body_html_file = self.filepath('body.html')
 228         self.body_pdf_file = self.filepath('body.pdf')
 229         self.preamble_html_file = self.filepath('preamble.html')
 230         self.preamble_pdf_file = self.filepath('preamble.pdf')
 231         self.tail_html_file = self.filepath('tail.html')
 232         self.tail_pdf_file = self.filepath('tail.pdf')
 233         self.isbn_pdf_file = None
 234         self.pdf_file = self.filepath('final.pdf')
 235         self.body_odt_file = self.filepath('body.odt')
 236
 237         self.publish_file = os.path.join(PUBLISH_PATH, bookname)
 238         self.publish_url = os.path.join(config.PUBLISH_URL, bookname)
 239
 240         if page_settings is not None:
 241             self.maker = PageSettings(**page_settings)
 242
 243         titles = get_metadata(self.metadata, 'title')
 244         if titles:
 245             self.title = titles[0]
 246         else:
 247             self.title = 'A Manual About ' + self.book
 248
 249         self.notify_watcher()
 250
 251
 252     if config.TRY_BOOK_CLEANUP_ON_DEL:
 253         #Dont even define __del__ if it is not used.
 254         _try_cleanup_on_del = True
 255         def __del__(self):
 256             if self._try_cleanup_on_del and os.path.exists(self.workdir):
 257                 self._try_cleanup_on_del = False #or else you can get in bad cycles
 258                 self.cleanup()
 259
 260     def get_tree_by_id(self, id):
 261         """get an HTML tree from the given manifest ID"""
 262         name = self.manifest[id]['url']
 263         mimetype = self.manifest[id]['mimetype']
 264         s = self.store.read(name)
 265         f = StringIO(s)
 266         if mimetype == 'text/html':
 267             try:
 268                 tree = lxml.html.parse(f)
 269             except etree.XMLSyntaxError, e:
 270                 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
 271                     (id, name, s[:20], e))
 272                 tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
 273         elif 'xml' in mimetype: #XXX or is this just asking for trouble?
 274             tree = etree.parse(f)
 275         else:
 276             tree = f.read()
 277         f.close()
 278         return tree
 279
 280     def filepath(self, fn):
 281         return os.path.join(self.workdir, fn)
 282
 283     def save_data(self, fn, data):
 284         """Save without tripping up on unicode"""
 285         if isinstance(data, unicode):
 286             data = data.encode('utf8', 'ignore')
 287         f = open(fn, 'w')
 288         f.write(data)
 289         f.close()
 290
 291     def save_tempfile(self, fn, data):
 292         """Save the data in a temporary directory that will be cleaned
 293         up when all is done.  Return the absolute file path."""
 294         fn = self.filepath(fn)
 295         self.save_data(fn, data)
 296         return fn
 297
 298     def make_oo_doc(self):
 299         """Make an openoffice document, using the html2odt script."""
 300         self.wait_for_xvfb()
 301         html_text = etree.tostring(self.tree, method="html")
 302         self.save_data(self.body_html_file, html_text)
 303         run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
 304         log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
 305         os.rename(self.body_odt_file, self.publish_file)
 306         self.notify_watcher()
 307
 308     def extract_pdf_outline(self):
 309         #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
 310         debugf = self.filepath('outline.txt')
 311         self.outline_contents, self.outline_text, number_of_pages = \
 312                 parse_outline(self.body_pdf_file, 1, debugf)
 313
 314         if not self.outline_contents:
 315             #probably problems with international text. need a horrible hack
 316             log('no outline: trying again with ascii headings')
 317             import copy
 318             tree = copy.deepcopy(self.tree)
 319             titlemap = {}
 320             for tag in ('h1', 'h2', 'h3', 'h4'):
 321                 for i, e in enumerate(tree.getiterator(tag)):
 322                     key = "%s_%s" % (tag, i)
 323                     titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
 324                     del e[:]
 325                     if tag == 'h1':
 326                         e = lxml.etree.SubElement(e, "strong", Class="initial")
 327                     e.text = key
 328                     log("key: %r, text: %r, value: %r" %(key, e.text, titlemap[key]))
 329
 330             ascii_html_file = self.filepath('body-ascii-headings.html')
 331             ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
 332             html_text = lxml.etree.tostring(tree, method="html")
 333             self.save_data(ascii_html_file, html_text)
 334             self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
 335             debugf = self.filepath('ascii_outline.txt')
 336             ascii_contents, ascii_text, number_of_ascii_pages = \
 337                 parse_outline(ascii_pdf_file, 1, debugf)
 338             self.outline_contents = []
 339             log ("number of pages: %s, post ascii: %s" %
 340                  (number_of_pages, number_of_ascii_pages))
 341             for ascii_title, depth, pageno in ascii_contents:
 342                 if ascii_title[-4:] == '&#0;': #stupid [something] puts this in
 343                     ascii_title = ascii_title[:-4]
 344                 if ' ' in ascii_title:
 345                     ascii_title = ascii_title.rsplit(' ', 1)[1]
 346                 title = titlemap.get(ascii_title, '')
 347                 log((ascii_title, title, depth, pageno))
 348
 349                 self.outline_contents.append((title, depth, pageno))
 350         else:
 351             for x in self.outline_contents:
 352                 log(x)
 353
 354         self.notify_watcher()
 355         return number_of_pages
 356
 357     def make_body_pdf(self):
 358         """Make a pdf of the HTML, using webkit"""
 359         #1. Save the html
 360         html_text = etree.tostring(self.tree, method="html")
 361         self.save_data(self.body_html_file, html_text)
 362
 363         #2. Make a pdf of it
 364         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
 365         self.notify_watcher('generate_pdf')
 366
 367         n_pages = self.extract_pdf_outline()
 368
 369         log ("found %s pages in pdf" % n_pages)
 370         #4. resize pages, shift gutters, even pages
 371         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 372         self.notify_watcher('reshape_pdf')
 373
 374         #5 add page numbers
 375         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 376                               numbers=self.page_numbers)
 377         self.notify_watcher("number_pdf")
 378         self.notify_watcher()
 379
 380     def make_preamble_pdf(self):
 381         contents = self.make_contents()
 382         inside_cover_html = self.compose_inside_cover()
 383         log(self.dir, self.css_url, self.title, inside_cover_html,
 384             self.toc_header, contents, self.title)
 385
 386         html = ('<html dir="%s"><head>\n'
 387                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 388                 '<link rel="stylesheet" href="%s" />\n'
 389                 '</head>\n<body>\n'
 390                 '<h1 class="frontpage">%s</h1>'
 391                 '%s\n'
 392                 '<div class="contents"><h1>%s</h1>\n%s</div>\n'
 393                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 394                 '<!--%s--></div></body></html>'
 395                 ) % (self.dir, self.css_url, self.title, inside_cover_html.decode('utf-8'),
 396                      self.toc_header, contents, self.title)
 397         self.save_data(self.preamble_html_file, html)
 398
 399         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
 400
 401         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 402
 403         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 404                             numbers=self.preamble_page_numbers,
 405                             number_start=-2)
 406
 407         self.notify_watcher()
 408
 409     def make_end_matter_pdf(self):
 410         """Make an inside back cover and a back cover.  If there is an
 411         isbn number its barcode will be put on the back cover."""
 412         if self.isbn:
 413             self.isbn_pdf_file = self.filepath('isbn.pdf')
 414             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 415             self.notify_watcher('make_barcode_pdf')
 416
 417         end_matter = self.compose_end_matter()
 418         log(end_matter)
 419         self.save_data(self.tail_html_file, end_matter.decode('utf-8'))
 420         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
 421
 422         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 423                                centre_end=True, even_pages=False)
 424         self.notify_watcher()
 425
 426     def make_book_pdf(self):
 427         """A convenient wrapper of a few necessary steps"""
 428         # now the Xvfb server is needed. make sure it has had long enough to get going
 429         self.wait_for_xvfb()
 430         self.make_body_pdf()
 431         self.make_preamble_pdf()
 432         self.make_end_matter_pdf()
 433
 434         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 435                     self.body_pdf_file, self.tail_pdf_file,
 436                     self.isbn_pdf_file)
 437
 438         self.notify_watcher('concatenated_pdfs')
 439
 440
 441     def make_simple_pdf(self, mode):
 442         """Make a simple pdf document without contents or separate
 443         title page.  This is used for multicolumn newspapers and for
 444         web-destined pdfs."""
 445         self.wait_for_xvfb()
 446         #0. Add heading to begining of html
 447         body = list(self.tree.cssselect('body'))[0]
 448         e = body.makeelement('h1', {'id': 'book-title'})
 449         e.text = self.title
 450         body.insert(0, e)
 451         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 452         e.addnext(intro)
 453
 454         #0.5 adjust parameters to suit the particular kind of output
 455         if mode == 'web':
 456             self.maker.gutter = 0
 457
 458         #1. Save the html
 459         html_text = etree.tostring(self.tree, method="html")
 460         self.save_data(self.body_html_file, html_text)
 461
 462         #2. Make a pdf of it (direct to to final pdf)
 463         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
 464         self.notify_watcher('generate_pdf')
 465         n_pages = count_pdf_pages(self.pdf_file)
 466
 467         if mode != 'web':
 468             #3. resize pages and shift gutters.
 469             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 470             self.notify_watcher('reshape_pdf')
 471
 472             #4. add page numbers
 473             self.maker.number_pdf(self.pdf_file, n_pages,
 474                                   dir=self.dir, numbers=self.page_numbers)
 475             self.notify_watcher("number_pdf")
 476         self.notify_watcher()
 477
 478
 479     def rotate180(self):
 480         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 481         presses."""
 482         rotated = self.filepath('final-rotate.pdf')
 483         unrotated = self.filepath('final-pre-rotate.pdf')
 484         #leave the unrotated pdf intact at first, in case of error.
 485         rotate_pdf(self.pdf_file, rotated)
 486         os.rename(self.pdf_file, unrotated)
 487         os.rename(rotated, self.pdf_file)
 488         self.notify_watcher()
 489
 490     def publish_pdf(self):
 491         """Move the finished PDF to its final resting place"""
 492         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 493         os.rename(self.pdf_file, self.publish_file)
 494         self.notify_watcher()
 495
 496
 497     def concat_html(self):
 498         """Join all the chapters together into one tree.  Keep the TOC
 499         up-to-date along the way."""
 500
 501         #each manifest item looks like:
 502         #{'contributors': []
 503         #'license': [],
 504         #'mimetype': '',
 505         #'rightsholders': []
 506         #'url': ''}
 507         doc = lxml.html.document_fromstring('<html><body></body></html>')
 508         tocmap = filename_toc_map(self.toc)
 509         for ID in self.spine:
 510             details = self.manifest[ID]
 511             log(ID, pformat(details))
 512             # ACO MIJENJAO
 513             try:
 514                 root = self.get_tree_by_id(ID).getroot()
 515             except:
 516                 continue
 517             #handle any TOC points in this file
 518             for point in tocmap[details['url']]:
 519                 #if the url has a #identifier, use it. Otherwise, make
 520                 #one up, using a hidden element at the beginning of
 521                 #the inserted document.
 522                 #XXX this will break if different files use the same ids
 523                 #XXX should either replace all, or replace selectively.
 524                 if point['fragment']:
 525                     fragment = point['fragment']
 526                 else:
 527                     body = _find_tag(root, 'body')
 528                     fragment = '%s_%s' % (self.cookie, point['index'])
 529                     #reuse first tag if it is suitable.
 530                     if (len(body) and
 531                         body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
 532                         if body[0].get('id') is None:
 533                             body[0].set('id', fragment)
 534                         else:
 535                             fragment = body[0].get('id')
 536                         #the chapter starts with a heading. that heading should be the chapter name.
 537                         if body[0].tag in ('h1', 'h2', 'h3'):
 538                             log('chapter has title "%s", found html title "%s"' %
 539                                 (point['title'], body[0].text_content()))
 540                             point['html_title'] = body[0].text_content()
 541                     else:
 542                         marker = body.makeelement('div', style="display:none",
 543                                                   id=fragment)
 544                         body.insert(0, marker)
 545                 point['html_id'] = fragment
 546
 547             add_guts(root, doc)
 548         return doc
 549
 550     def unpack_static(self):
 551         """Extract static files from the zip for the html to refer to."""
 552         static_files = [x['url'] for x in self.manifest.values()
 553                         if x['url'].startswith('static')]
 554         if static_files:
 555             os.mkdir(self.filepath('static'))
 556
 557         for name in static_files:
 558             s = self.store.read(name)
 559             f = open(self.filepath(name), 'w')
 560             f.write(s)
 561             f.close()
 562         self.notify_watcher()
 563
 564     def load_book(self):
 565         """"""
 566         #XXX concatenate the HTML to match how TWiki version worked.
 567         # This is perhaps foolishly early -- throwing away useful boundaries.
 568         self.unpack_static()
 569         self.tree = self.concat_html()
 570         self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))
 571
 572         self.headings = [x for x in self.tree.cssselect('h1')]
 573         if self.headings:
 574             self.headings[0].set('class', "first-heading")
 575         for h1 in self.headings:
 576             h1.title = h1.text_content().strip()
 577         self.notify_watcher()
 578
 579     def make_contents(self):
 580         """Generate HTML containing the table of contents.  This can
 581         only be done after the main PDF has been made, because the
 582         page numbers are contained in the PDF outline."""
 583         header = '<h1>Table of Contents</h1><table class="toc">\n'
 584         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 585                     '<td class="pagenumber">%s</td></tr>\n')
 586         empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
 587         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 588         footer = '\n</table>'
 589
 590         contents = []
 591
 592         chapter = 1
 593         page_num = 1
 594         subsections = [] # for the subsection heading pages.
 595
 596         outline_contents = iter(self.outline_contents)
 597         headings = iter(self.headings)
 598
 599         for section in self.toc:
 600             if not section.get('children'):
 601                 contents.append(empty_section_tmpl % section['title'])
 602                 continue
 603             contents.append(section_tmpl % section['title'])
 604
 605             for point in section['children']:
 606                 try:
 607                     h1_text, level, page_num = outline_contents.next()
 608                 except StopIteration:
 609                     log("contents data not found for %s. Stopping" % (point,))
 610                     break
 611                 contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
 612                 chapter += 1
 613
 614         doc = header + '\n'.join(contents) + footer
 615         self.notify_watcher()
 616         return doc
 617
 618     def add_section_titles(self):
 619         """Add any section heading pages that the TOC.txt file
 620         specifies.  These are sub-book, super-chapter groupings.
 621
 622         Also add initial numbers to chapters.
 623         """
 624         headings = iter(self.headings)
 625         chapter = 1
 626         section = None
 627         log(self.toc)
 628         for t in self.toc:
 629             #only top level sections get a subsection page,
 630             #and only if they have children.
 631             if t.get('children'):
 632                 section = self.tree.makeelement('div', Class="objavi-subsection")
 633                 heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
 634                 heading.text = t['title']
 635                 for child in t['children']:
 636                     item = etree.SubElement(section, 'div', Class="objavi-chapter")
 637                     if 'html_title' in child:
 638                         item.text = child['html_title']
 639                         heading = self.tree.cssselect('#'+ child['html_id'])
 640                         if heading:
 641                             _add_initial_number(heading[0], chapter)
 642                     else:
 643                         item.text = child['title']
 644                     _add_initial_number(item, chapter)
 645                     log(item.text, debug='HTMLGEN')
 646                     chapter += 1
 647                 log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
 648                 location = self.tree.cssselect('#'+ t['html_id'])[0]
 649                 location.addprevious(section)
 650
 651
 652         self.notify_watcher()
 653
 654
 655     def add_css(self, css=None, mode='book'):
 656         """If css looks like a url, use it as a stylesheet link.
 657         Otherwise it is the CSS itself, which is saved to a temporary file
 658         and linked to."""
 659         log("css is %r" % css)
 660         htmltree = self.tree
 661         if css is None or not css.strip():
 662             defaults = config.SERVER_DEFAULTS[self.server]
 663             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 664         elif not re.match(r'^http://\S+$', css):
 665             fn = self.save_tempfile('objavi.css', css)
 666             url = 'file://' + fn
 667         else:
 668             url = css
 669         #XXX for debugging and perhaps sensible anyway
 670         #url = url.replace('file:///home/douglas/objavi2', '')
 671
 672
 673         #find the head -- it's probably first child but lets not assume.
 674         for child in htmltree:
 675             if child.tag == 'head':
 676                 head = child
 677                 break
 678         else:
 679             head = htmltree.makeelement('head')
 680             htmltree.insert(0, head)
 681
 682         link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 683         self.css_url = url
 684         self.notify_watcher()
 685         return url
 686
 687
 688     def _read_localised_template(self, template, fallbacks=['en']):
 689         """Try to get the template in the approriate language, otherwise in english."""
 690         for lang in [self.lang] + fallbacks:
 691             try:
 692                 fn = template % (lang)
 693                 f = open(fn)
 694                 break
 695             except IOError, e:
 696                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 697                 log(e)
 698         template = f.read()
 699         f.close()
 700         return template
 701
 702     def compose_inside_cover(self):
 703         """create the markup for the preamble inside cover."""
 704         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 705
 706         if self.isbn:
 707             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 708         else:
 709             isbn_text = ''
 710
 711         return template % {'date': time.strftime('%Y-%m-%d'),
 712                            'isbn': isbn_text,
 713                            'license': self.license,
 714                            }
 715
 716
 717     def compose_end_matter(self):
 718         """create the markup for the end_matter inside cover.  If
 719         self.isbn is not set, the html will result in a pdf that
 720         spills onto two pages.
 721         """
 722         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 723
 724         d = {'css_url': self.css_url,
 725              'title': self.title
 726              }
 727
 728         if self.isbn:
 729             d['inside_cover_style'] = ''
 730         else:
 731             d['inside_cover_style'] = 'page-break-after: always'
 732
 733         return template % d
 734
 735
 736     def make_epub(self, use_cache=False):
 737         """Make an epub version of the book, using Mike McCabe's
 738         epub module for the Internet Archive."""
 739         ebook = ia_epub.Book(self.publish_file, content_dir='')
 740         def add_file(ID, filename, mediatype, content):
 741             ebook.add_content({'media-type': mediatype.encode('utf-8'),
 742                                'id': ID.encode('utf-8'),
 743                                'href': filename.encode('utf-8'),
 744                                }, content)
 745
 746         toc = self.info['TOC']
 747
 748         #manifest
 749         filemap = {} #map html to corresponding xhtml
 750         spinemap = {} #map IDs to multi-file chapters
 751         for ID in self.manifest:
 752             details = self.manifest[ID]
 753             log(ID, pformat(details))
 754             fn, mediatype = details['url'], details['mimetype']
 755             content = self.store.read(fn)
 756             if mediatype == 'text/html':
 757                 #convert to application/xhtml+xml, and perhaps split
 758                 c = EpubChapter(self.server, self.book, ID, content,
 759                                 use_cache=use_cache)
 760                 c.remove_bad_tags()
 761                 if fn[-5:] == '.html':
 762                     fnbase = fn[:-5]
 763                 else:
 764                     fnbase = fn
 765                 fnx = fnbase + '.xhtml'
 766                 mediatype = 'application/xhtml+xml'
 767
 768                 fragments = split_html(c.as_xhtml(),
 769                                        compressed_size=self.store.getinfo(fn).compress_size)
 770
 771                 #add the first one as if it is the whole thing (as it often is)
 772                 add_file(ID, fnx, mediatype, fragments[0])
 773                 filemap[fn] = fnx
 774                 if len(fragments) > 1:
 775                     spine_ids = [ID]
 776                     spinemap[ID] = spine_ids
 777                     #add any extras
 778                     for i in range(1, len(fragments)):
 779                         # XXX it is possible for duplicates if another
 780                         # file happens to have this name. Ignore for now
 781                         _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
 782                         spine_ids.append(_id)
 783                         add_file(_id,
 784                                  '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
 785                                  mediatype, fragments[i])
 786
 787             else:
 788                 add_file(ID, fn, mediatype, content)
 789
 790         #toc
 791         ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
 792         ebook.add(ebook.content_dir + 'toc.ncx', ncx)
 793
 794         #spine
 795         for ID in self.spine:
 796             if ID in spinemap:
 797                 for x in spinemap[ID]:
 798                     ebook.add_spine_item({'idref': x})
 799             else:
 800                 ebook.add_spine_item({'idref': ID})
 801
 802         #metadata -- no use of attributes (yet)
 803         # and fm: metadata disappears for now
 804         DCNS = config.DCNS
 805         DC = config.DC
 806         meta_info_items = []
 807         for ns, namespace in self.metadata.items():
 808             for keyword, schemes in namespace.items():
 809                 if ns:
 810                     keyword = '{%s}%s' % (ns, keyword)
 811                 for scheme, values in schemes.items():
 812                     for value in values:
 813                         item = {
 814                             'item': keyword,
 815                             'text': value,
 816                             }
 817                         if scheme:
 818                             if keyword in (DCNS + 'creator', DCNS + 'contributor'):
 819                                 item['atts'] = {'role': scheme}
 820                             else:
 821                                 item['atts'] = {'scheme': scheme}
 822
 823         has_authors = 'creator' in self.metadata[DC]
 824         if not has_authors and config.CLAIM_UNAUTHORED:
 825             meta_info_items.append({'item': DCNS + 'creator',
 826                                     'text': 'The Contributors'})
 827
 828             meta_info_items.append({'item': DCNS + 'rights',
 829                                     'text': 'This book is free. Copyright %s' % (', '.join(authors))}
 830                                    )
 831
 832         tree_str = ia_epub.make_opf(meta_info_items,
 833                                     ebook.manifest_items,
 834                                     ebook.spine_items,
 835                                     ebook.guide_items,
 836                                     ebook.cover_id)
 837         ebook.add(ebook.content_dir + 'content.opf', tree_str)
 838         ebook.z.close()
 839         self.notify_watcher()
 840
 841
 842     def publish_s3(self):
 843         """Push the book's epub to archive.org, using S3."""
 844         #XXX why only epub?
 845         secrets = {}
 846         for x in ('S3_SECRET', 'S3_ACCESSKEY'):
 847             fn = getattr(config, x)
 848             f = open(fn)
 849             secrets[x] = f.read().strip()
 850             f.close()
 851
 852         log(secrets)
 853         now = time.strftime('%F')
 854         s3output = self.filepath('s3-output.txt')
 855         s3url = 'http://s3.us.archive.org/booki-%s/%s' % (self.book, self.bookname)
 856         detailsurl = 'http://archive.org/details/booki-%s' % (self.book,)
 857         headers = [
 858             'x-amz-auto-make-bucket:1',
 859             "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
 860             'x-archive-meta-mediatype:texts',
 861             'x-archive-meta-collection:opensource',
 862             'x-archive-meta-title:%s' %(self.book,),
 863             'x-archive-meta-date:%s' % (now,),
 864             'x-archive-meta-creator:FLOSS Manuals Contributors',
 865             ]
 866
 867         if self.license in config.LICENSES:
 868             headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])
 869
 870         argv = ['curl', '--location', '-s', '-o', s3output]
 871         for h in headers:
 872             argv.extend(('--header', h))
 873         argv.extend(('--upload-file', self.publish_file, s3url,))
 874
 875         log(' '.join(repr(x) for x in argv))
 876         check_call(argv, stdout=sys.stderr)
 877         self.notify_watcher()
 878         return detailsurl, s3url
 879
 880
 881     def spawn_x(self):
 882         """Start an Xvfb instance, using a new server number.  A
 883         reference to it is stored in self.xvfb, which is used to kill
 884         it when the pdf is done.
 885
 886         Note that Xvfb doesn't interact well with dbus which is
 887         present on modern desktops.
 888         """
 889         #Find an unused server number (in case two cgis are running at once)
 890         while True:
 891             servernum = random.randrange(50, 500)
 892             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 893                 break
 894
 895         self.xserver_no = ':%s' % servernum
 896
 897         authfile = self.filepath('Xauthority')
 898         os.environ['XAUTHORITY'] = authfile
 899
 900         #mcookie(1) eats into /dev/random, so avoid that
 901         from hashlib import md5
 902         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 903         mcookie = m.hexdigest()
 904
 905         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 906
 907         self.xvfb = Popen(['Xvfb', self.xserver_no,
 908                            '-screen', '0', '1024x768x24',
 909                            '-pixdepths', '32',
 910                            #'-blackpixel', '0',
 911                            #'-whitepixel', str(2 ** 24 -1),
 912                            #'+extension', 'Composite',
 913                            '-dpi', '96',
 914                            '-kb',
 915                            '-nolisten', 'tcp',
 916                            ])
 917
 918         # We need to wait a bit before the Xvfb is ready.  but the
 919         # downloads are so slow that that probably doesn't matter
 920
 921         self.xvfb_ready_time = time.time() + 2
 922
 923         os.environ['DISPLAY'] = self.xserver_no
 924         log(self.xserver_no)
 925
 926     def wait_for_xvfb(self):
 927         """wait until a previously set time before continuing.  This
 928         is so Xvfb has time to properly start."""
 929         if hasattr(self, 'xvfb'):
 930             d = self.xvfb_ready_time - time.time()
 931             if d > 0:
 932                 time.sleep(d)
 933                 self.notify_watcher()
 934
 935     def cleanup_x(self):
 936         """Try very hard to kill off Xvfb.  In addition to killing
 937         this instance's xvfb, occasionally (randomly) search for
 938         escaped Xvfb instances and kill those too."""
 939         if not hasattr(self, 'xvfb'):
 940             return
 941         check_call(['xauth', 'remove', self.xserver_no])
 942         p = self.xvfb
 943         log("trying to kill Xvfb %s" % p.pid)
 944         os.kill(p.pid, 15)
 945         for i in range(10):
 946             if p.poll() is not None:
 947                 log("%s died with %s" % (p.pid, p.poll()))
 948                 break
 949             log("%s not dead yet" % p.pid)
 950             time.sleep(0.2)
 951         else:
 952             log("Xvfb would not die! kill -9! kill -9!")
 953             os.kill(p.pid, 9)
 954
 955         if random.random() < 0.1:
 956             # occasionally kill old xvfbs and soffices, if there are any.
 957             self.kill_old_processes()
 958
 959     def kill_old_processes(self):
 960         """Sometimes, despite everything, Xvfb or soffice instances
 961         hang around well after they are wanted -- for example if the
 962         cgi process dies particularly badly. So kill them if they have
 963         been running for a long time."""
 964         log("running kill_old_processes")
 965         p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
 966                    '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 967         data = p.communicate()[0].strip()
 968         if data:
 969             lines = data.split('\n')
 970             pids = []
 971             for line in lines:
 972                 log('dealing with ps output "%s"' % line)
 973                 try:
 974                     pid, days, hours, minutes, seconds \
 975                          = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
 976                 except AttributeError:
 977                     log("Couldn't parse that line!")
 978                 # 50 minutes should be enough xvfb time for anyone
 979                 if days or hours or int(minutes) > 50:
 980                     pid = int(pid)
 981                     log("going to kill pid %s" % pid)
 982                     os.kill(pid, 15)
 983                     pids.append(pid)
 984
 985             time.sleep(1.0)
 986             for pid in pids:
 987                 #try again in case any are lingerers
 988                 try:
 989                     os.kill(int(pid), 9)
 990                 except OSError, e:
 991                     log('PID %s seems dead (re-kill gives %s)' % (pid, e))
 992                     continue
 993                 log('killing %s with -9' % pid)
 994         self.notify_watcher()
 995
 996     def cleanup(self):
 997         self.cleanup_x()
 998         if not config.KEEP_TEMP_FILES:
 999             for fn in os.listdir(self.workdir):
1000                 os.remove(os.path.join(self.workdir, fn))
1001             os.rmdir(self.workdir)
1002         else:
1003             log("NOT removing '%s', containing the following files:" % self.workdir)
1004             log(*os.listdir(self.workdir))
1005
1006         self.notify_watcher()
1007
1008
1009 def use_cache():
1010     return (os.environ.get('HTTP_HOST') in config.USE_ZIP_CACHE_ALWAYS_HOSTS)
1011
1012 def _read_cached_zip(server, book, max_age):
1013     #find a recent zip if possible
1014     prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
1015     from glob import glob
1016     zips = sorted(glob(prefix + '*.zip'))
1017     if not zips:
1018         log("no cached booki-zips matching %s*.zip" % (prefix,))
1019         return None
1020     zipname = zips[-1]
1021     cutoff = time.time() - max_age * 60
1022     log(repr(zipname))
1023     try:
1024         date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
1025         if date > cutoff:
1026             f = open(zipname)
1027             blob = f.read()
1028             f.close()
1029             return blob
1030         log("%s is too old, must reload" % zipname)
1031         return None
1032     except (IOError, IndexError, ValueError), e:
1033         log('could not make sense of %s: got exception %s' % (zipname, e))
1034         return None
1035
1036
1037
1038 def fetch_zip(server, book, project, save=False, max_age=-1):
1039     interface = config.SERVER_DEFAULTS[server]['interface']
1040     if interface not in ('Booki', 'TWiki'):
1041         raise NotImplementedError("Can't handle '%s' interface" % interface)
1042     if interface == 'Booki':
1043         url = config.BOOKI_ZIP_URL  % {'server': server, 'project': project, 'book':book}
1044     else:
1045         url = config.TWIKI_GATEWAY_URL % (HTTP_HOST, server, book)
1046
1047     if use_cache() and max_age < 0:
1048         #default to 12 hours cache on objavi.halo.gen.nz
1049         max_age = 12 * 60
1050
1051     if max_age:
1052         log('WARNING: trying to use cached booki-zip',
1053             'If you are debugging booki-zip creation, you will go CRAZY'
1054             ' unless you switch this off')
1055         blob = _read_cached_zip(server, book, max_age)
1056         if blob is not None:
1057             return blob
1058
1059     log('fetching zip from %s'% url)
1060     f = urlopen(url)
1061     blob = f.read()
1062     f.close()
1063     if save:
1064         zipname = make_book_name(book, server, '.zip')
1065         f = open('%s/%s' % (config.BOOKI_BOOK_DIR, zipname), 'w')
1066         f.write(blob)
1067         f.close()
1068     return blob
1069
1070
1071
1072 def split_html(html, compressed_size=None, xhtmlise=False):
1073     if compressed_size is None:
1074         import zlib
1075         compressed_size = len(zlib.compress(html))
1076
1077     splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
1078                  len(html) // config.EPUB_FILE_SIZE_MAX)
1079     log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))
1080
1081     if not splits:
1082         return [html]
1083
1084     if xhtmlise:
1085         #xhtmlisation removes '<' in attributes etc, which makes the
1086         #marker insertion more reliable
1087         html = etree.tostring(lxml.html.fromstring(html),
1088                               encoding='UTF-8',
1089                               #method='html'
1090                               )
1091
1092     target = len(html) // (splits + 1)
1093     s = 0
1094     fragments = []
1095     for i in range(splits):
1096         e = html.find('<', target * (i + 1))
1097         fragments.append(html[s:e])
1098         fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS, i))
1099         s = e
1100     fragments.append(html[s:])
1101     root = lxml.html.fromstring(''.join(fragments))
1102
1103     # find the node lineages along which to split the document.
1104     # anything outside these lines (i.e., branches) can be copied
1105     # wholesale.
1106
1107     stacks = []
1108     for hr in root.iter(tag='hr'):
1109         if hr.get('class') == config.MARKER_CLASS:
1110             stack = [hr]
1111             stack.extend(x for x in hr.iterancestors())
1112             stack.reverse()
1113             stacks.append(stack)
1114
1115     iterstacks = iter(stacks)
1116
1117     src = root
1118     log('root is', root, root.attrib, type(root.attrib))
1119     dest = lxml.html.Element(root.tag, **dict(root.items()))
1120     doc = dest
1121     stack = iterstacks.next()
1122     marker = stack[-1]
1123
1124     chapters = []
1125     try:
1126         while True:
1127             for e in src:
1128                 if e not in stack:
1129                     #cut and paste branch
1130                     dest.append(e)
1131                 elif e is marker:
1132                     #got one
1133                     src.remove(e)
1134                     chapters.append(doc)
1135                     src = root
1136                     dest = lxml.html.Element(root.tag, **dict(root.items()))
1137                     doc = dest
1138                     stack = iterstacks.next()
1139                     marker = stack[-1]
1140                     break
1141                 else:
1142                     #next level
1143                     dest = etree.SubElement(dest, e.tag, **dict(e.items()))
1144                     dest.text = e.text
1145                     e.text = None
1146                     src = e
1147                     break
1148     except StopIteration:
1149         #stacks have run out -- the rest of the tree is the last section
1150         chapters.append(src)
1151
1152     #return chapters
1153     return [etree.tostring(c, encoding='UTF-8', method='html') for c in chapters]
1154