fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
  37 TMPDIR = os.path.abspath(config.TMPDIR)
  38 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  39 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63 def _add_chapter_cookie(e):
  64     """add magic hidden text to help with contents generation"""
  65     cookie = e.makeelement("span", Class="heading-cookie", dir="ltr",
  66                            style="font-size:6pt; line-height: 6pt; color: #fff; width:0;"
  67                            " float:left; margin:-2em; z-index: -67; display: block;"
  68                            )
  69     cookie.text = ''.join(random.choice(config.CHAPTER_COOKIE_CHARS) for x in range(8))
  70     e.cookie = cookie.text
  71     e.addnext(cookie)
  72     #e.append(cookie)
  73
  74
  75 class TocItem(object):
  76     """This makes sense of the tuples from TOC.txt files"""
  77     def __init__(self, status, chapter, title):
  78         # status is
  79         #  0 - section heading with no chapter
  80         #  1 - chapter heading
  81         #  2 - book title
  82         #
  83         # chapter is twiki name of the chapter
  84         # title is a human readable name of the chapter.
  85         self.status = status
  86         self.chapter = chapter
  87         self.title = title
  88
  89     def is_chapter(self):
  90         return self.status == '1'
  91
  92     def is_section(self):
  93         return self.status == '0'
  94
  95     def __str__(self):
  96         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  97
  98
  99 def run(cmd):
 100     try:
 101         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 102         out, err = p.communicate()
 103     except Exception:
 104         log("Failed on command: %r" % cmd)
 105         raise
 106     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
 107         (' '.join(cmd), cmd[0], p.poll(), out, err))
 108
 109
 110 def find_containing_paper(w, h):
 111     size = None
 112     for name, pw, ph in config.PAPER_SIZES:
 113         if pw >= w and ph >= h:
 114             mw = (pw - w) * 0.5
 115             mh = (ph - h) * 0.5
 116             return (name, mw, mh)
 117
 118     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 119                      (w * POINT_2_MM, h * POINT_2_MM))
 120
 121
 122
 123 class PageSettings(object):
 124     """Calculates and wraps commands for the generation and processing
 125     of PDFs"""
 126     def __init__(self, pointsize, **kwargs):
 127         # the formulas for default gutters, margins and column margins
 128         # are quite ad-hoc and certainly improvable.
 129
 130         self.width, self.height = pointsize
 131         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
 132
 133         self.gutter = kwargs.get('gutter', (config.BASE_GUTTER +
 134                                             config.PROPORTIONAL_GUTTER * self.width))
 135
 136         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
 137         self.top_margin = kwargs.get('top_margin', default_margin)
 138         self.side_margin = kwargs.get('top_margin', default_margin)
 139         self.bottom_margin = kwargs.get('top_margin', default_margin)
 140         self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
 141         self.columns = kwargs.get('columns', 1)
 142
 143         self.column_margin = kwargs.get('column_margin', default_margin * 2 / (4.0 + self.columns))
 144
 145         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
 146         self.number_margin = self.side_margin
 147
 148         # calculate margins in mm for browsers
 149         self.margins = []
 150         for m, clip in ((self.top_margin, clipy),
 151                         (self.side_margin, clipx + 0.5 * self.gutter),
 152                         (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
 153                         (self.side_margin, clipx + 0.5 * self.gutter),
 154                         ):
 155             if m is None:
 156                 m = default_margin
 157             self.margins.append((m + clip) * POINT_2_MM)
 158
 159         for x in locals().iteritems():
 160             log("%s: %s" % x, debug='PDFGEN')
 161         for x in dir(self):
 162             log("%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 163
 164
 165
 166     def _webkit_command(self, html, pdf):
 167         m = [str(x) for x in self.margins]
 168         cmd = [config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 169                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 170                ] + config.WKHTMLTOPDF_EXTRA_COMMANDS + [
 171                html, pdf]
 172         log(' '.join(cmd))
 173         return cmd
 174
 175     def _gecko_command(self, html, pdf):
 176         m = [str(x) for x in self.margins]
 177         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 178         cmd = [FIREFOX, '-P', 'pdfprint', '-print',
 179                html, '-printprinter', self.moz_printer]
 180         log(' '.join(cmd))
 181         return cmd
 182
 183     def make_raw_pdf(self, html, pdf, engine='webkit'):
 184         func = getattr(self, '_%s_command' % engine)
 185         if self.columns == 1:
 186             cmd = func(html, pdf)
 187             run(cmd)
 188         else:
 189             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 190             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 191             page_width = column_width + self.column_margin
 192
 193             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 194                                        gutter=0, top_margin=self.top_margin,
 195                                        side_margin=self.column_margin * 0.5,
 196                                        bottom_margin=self.bottom_margin)
 197
 198             column_pdf = pdf[:-4] + '-single-column.pdf'
 199             columnmaker.make_raw_pdf(html, column_pdf, engine=engine)
 200             columnmaker.reshape_pdf(column_pdf)
 201
 202             cmd = ['pdfnup',
 203                    '--nup', '%sx1' % int(self.columns),
 204                    '--paper', self.papersize.lower() + 'paper',
 205                    '--outfile', pdf,
 206                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 207                    '--noautoscale', 'true',
 208                    '--orient', 'portrait',
 209                    #'--tidy', 'false',
 210                    column_pdf
 211                    ]
 212             run(cmd)
 213
 214
 215
 216     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 217                     even_pages=True):
 218         """Spin the pdf for RTL text, resize it to the right size, and
 219         shift the gutter left and right"""
 220         ops = 'resize'
 221         if self.gutter:
 222             ops += ',shift'
 223         if even_pages:
 224             ops += ',even_pages'
 225         gutter = self.gutter
 226         if dir == 'RTL':
 227             gutter = -gutter
 228         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 229                'dir=%s' % dir,
 230                'filename=%s' % pdf,
 231                'output_filename=%s' % pdf,
 232                'operation=%s' % ops,
 233                'width=%s' % self.width,
 234                'height=%s' % self.height,
 235                'offset=%s' % gutter,
 236                'centre_start=%s' % centre_start,
 237                'centre_end=%s' % centre_end,
 238                ]
 239         run(cmd)
 240
 241     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 242                     number_start=1):
 243         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 244                'operation=page_numbers',
 245                'dir=%s' % dir,
 246                'filename=%s' % pdf,
 247                'output_filename=%s' % pdf,
 248                'number_start=%s' % number_start,
 249                'number_style=%s' % numbers,
 250                'number_bottom=%s' % self.number_bottom,
 251                'number_margin=%s' % self.number_margin,
 252                ]
 253         run(cmd)
 254
 255     def number_pdf(self, pdf, pages, **kwargs):
 256         # if there are too many pages for pdfedit to handle in one go,
 257         # split the job into bits.  <pages> may not be exact
 258         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 259             self._number_pdf(pdf, **kwargs)
 260         else:
 261             # section_size must be even
 262             sections = pages // PDFEDIT_MAX_PAGES + 1
 263             section_size = (pages // sections + 2) & ~1
 264
 265             pdf_sections = []
 266             s = kwargs.pop('number_start', 1)
 267             while s < pages:
 268                 e = s + section_size - 1
 269                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 270                 if e < pages - 1:
 271                     page_range = '%s-%s' % (s, e)
 272                 else:
 273                     page_range = '%s-end' % s
 274                 run(['pdftk',
 275                      pdf,
 276                      'cat',
 277                      page_range,
 278                      'output',
 279                      pdf_section,
 280                      ])
 281                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 282                 pdf_sections.append(pdf_section)
 283                 s = e + 1
 284
 285             concat_pdfs(pdf, *pdf_sections)
 286
 287     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 288         """Put ann ISBN barcode in a corner of a single blank page."""
 289
 290         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 291         cmd1 = [config.BOOKLAND,
 292                 '--position', position,
 293                 str(isbn)]
 294         cmd2 = ['ps2pdf',
 295                 '-dFIXEDMEDIA',
 296                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 297                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 298                 '-', pdf]
 299
 300         p1 = Popen(cmd1, stdout=PIPE)
 301         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 302         out, err = p2.communicate()
 303
 304         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 305         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 306
 307
 308
 309
 310 def concat_pdfs(name, *args):
 311     """Join all the named pdfs together into one and save it as <name>"""
 312     cmd = ['pdftk']
 313     cmd.extend(x for x in args if x is not None)
 314     cmd += ['cat', 'output', name]
 315     run(cmd)
 316
 317 def index_pdf(pdf, text=None):
 318     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 319     separate pages."""
 320     if text is None:
 321         text = pdf + '.index.txt'
 322     cmd = ['pdftotext',
 323            #'-layout', #keeps more original formatting
 324            pdf,
 325            text]
 326     run(cmd)
 327     return text
 328
 329 def rotate_pdf(pdfin, pdfout):
 330     """Turn the PDF on its head"""
 331     cmd = ['pdftk', pdfin,
 332            'cat',
 333            '1-endD',
 334            'output',
 335            pdfout
 336            ]
 337     run(cmd)
 338
 339
 340 class Book(object):
 341     page_numbers = 'latin'
 342     preamble_page_numbers = 'roman'
 343     engine= 'webkit'
 344     _try_cleanup_on_del = True
 345
 346     def notify_watcher(self, message=None):
 347         if self.watcher:
 348             if  message is None:
 349                 #message is the name of the caller
 350                 #XXX look at using inspect module
 351                 import traceback
 352                 message = traceback.extract_stack(None, 2)[0][2]
 353             log("notify_watcher called with '%s'" % message)
 354             self.watcher(message)
 355
    def __enter__(self):
        """Context manager entry: the book itself is the managed object."""
        return self
 358
    def __exit__(self, exc_type, exc_value, traceback):
        """Context manager exit: call self.cleanup() whether or not the
        block raised.  Nothing is returned, so an exception raised in
        the block still propagates."""
        self.cleanup()
        #could deal with exceptions here and return true
 362
    def __init__(self, book, server, bookname,
                 page_settings=None, engine=None, watcher=None, isbn=None,
                 license=config.DEFAULT_LICENSE):
        """Set up a working directory and all the file names and urls
        needed to turn one book into a pdf.

        book and server are substituted into config.BOOK_URL and
        config.TOC_URL.
        bookname prefixes the working directory and names the published
        file.
        page_settings is handed to set_page_dimensions, which expands it
        as keyword arguments for PageSettings.
        NOTE(review): the None default cannot be expanded that way, so
        callers apparently always have to supply a mapping -- confirm.
        engine, if given, overrides the class default.
        watcher, if given, is called with progress messages (see
        notify_watcher).
        isbn and license are stored for use in the inside cover and end
        matter.
        """
        log("*** Starting new book %s ***" % bookname)
        self.book = book
        self.server = server
        self.watcher = watcher
        self.isbn = isbn
        self.license = license
        # every book gets its own fresh working directory under TMPDIR
        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        os.chmod(self.workdir, 0755)
        # unknown servers get the default server's css, language and direction
        defaults = SERVER_DEFAULTS.get(server, SERVER_DEFAULTS[DEFAULT_SERVER])
        self.default_css = defaults['css']
        self.lang = defaults['lang']
        self.dir  = defaults['dir']

        # intermediate files, all inside the working directory
        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.body_index_file = self.filepath('body.txt')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        # stays None unless something else sets it; concat_pdfs skips None
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')

        # where the finished pdf ends up, as a file and as a url
        self.publish_name = bookname
        self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
        self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)

        self.book_url = config.BOOK_URL % (self.server, self.book)
        self.toc_url = config.TOC_URL % (self.server, self.book)

        self.set_page_dimensions(page_settings)

        if engine is not None:
            self.engine = engine
        self.notify_watcher()
 401
 402     def __del__(self):
 403         if os.path.exists(self.workdir) and self._try_cleanup_on_del:
 404             self._try_cleanup_on_del = False #or else you can get in bad cycles
 405             self.cleanup()
 406
 407     def __getattr__(self, attr):
 408         """catch unloaded books and load them"""
 409         #log('looking for missing attribute "%s"' % (attr))
 410         if attr == 'tree':
 411             self.load_book()
 412             return self.tree
 413         if attr == 'toc':
 414             self.load_toc()
 415             return self.toc
 416         raise AttributeError("no such member: '%s'" % attr)
 417
 418
 419     def filepath(self, fn):
 420         return os.path.join(self.workdir, fn)
 421
 422     def save_data(self, fn, data):
 423         """Save without tripping up on unicode"""
 424         if isinstance(data, unicode):
 425             data = data.encode('utf8', 'ignore')
 426         f = open(fn, 'w')
 427         f.write(data)
 428         f.close()
 429
 430     def save_tempfile(self, fn, data):
 431         """Save the data in a temporary directory that will be cleaned
 432         up when all is done.  Return the absolute file path."""
 433         fn = self.filepath(fn)
 434         self.save_data(fn, data)
 435         return fn
 436
    def set_page_dimensions(self, dimensions):
        """Create the PageSettings object used for all pdf work and
        keep it as self.maker.

        dimensions is expanded into keyword arguments, so it has to be a
        mapping and must include 'pointsize', which PageSettings
        requires."""
        self.maker = PageSettings(**dimensions)
 439
 440
 441     def extract_pdf_text(self):
 442         """Extract the text from the body pdf, split into pages, so
 443         that the correct page can be found to generate the table of
 444         contents."""
 445         index_pdf(self.body_pdf_file, self.body_index_file)
 446         f = open(self.body_index_file)
 447         s = unicode(f.read(), 'utf8')
 448         f.close()
 449         #pages are spearated by formfeed character "^L", "\f" or chr(12)
 450         self.text_pages = s.split("\f")
 451         #there is sometimes (probably always) an unwanted ^L at the end
 452         return len(self.text_pages)
 453
    def make_body_pdf(self):
        """Make a pdf of the book's HTML tree, using the engine named by
        self.engine, then resize it and add page numbers.

        The watcher is notified after each step."""
        #1. Save the html
        html_text = lxml.etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
                                engine=self.engine)
        self.notify_watcher('generate_pdf')

        #3. extract the text for finding contents.
        # (this also gives the page count used for numbering in step 5)
        n_pages = self.extract_pdf_text()
        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, and rotate 180 degrees for RTL
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        # with no message, the watcher is told the name of this method
        self.notify_watcher()
 477
    def make_preamble_pdf(self):
        """Make the pdf that goes in front of the body: the title page,
        the inside cover and the table of contents.

        make_contents needs the page text of the body pdf, so the body
        has to be made first.  self.css_url and self.title are also used
        here; they are set by add_css and set_title."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        # NOTE(review): the title is put into the markup as it is, both
        # as a heading and inside an html comment -- confirm that titles
        # are trusted to be safe html.
        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents">%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     contents, self.title)
        self.save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
                                engine=self.engine)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        # the page count is not known here, so None is passed and the
        # pdf is numbered in a single run.
        # NOTE(review): number_start=-2 presumably keeps numbers off the
        # first pages -- confirm against wk_objavi.qs.
        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                            numbers=self.preamble_page_numbers,
                            number_start=-2)

        self.notify_watcher()
 504
    def make_pdf(self):
        """A convenient wrapper of a few necessary steps: make the body,
        preamble and end matter pdfs and join them into self.pdf_file."""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        # the body comes first because the preamble's table of contents
        # is built from the body's page text
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        # isbn_pdf_file may still be None, in which case concat_pdfs skips it
        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')
        #and move it into place (what place?)
 519
 520     def rotate180(self):
 521         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 522         presses."""
 523         rotated = self.filepath('final-rotate.pdf')
 524         unrotated = self.filepath('final-pre-rotate.pdf')
 525         #leave the unrotated pdf intact at first, in case of error.
 526         rotate_pdf(self.pdf_file, rotated)
 527         os.rename(self.pdf_file, unrotated)
 528         os.rename(rotated, self.pdf_file)
 529         self.notify_watcher()
 530
 531     def publish_pdf(self):
 532         """Move the finished PDF to its final resting place"""
 533         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 534         os.rename(self.pdf_file, self.publish_file)
 535         self.notify_watcher()
 536
 537     def load_toc(self):
 538         """From the TOC.txt file create a list of TocItems with
 539         the attributes <status>, <chapter>, and <title>.
 540
 541         <status> is a number, with the following meaning:
 542
 543               0 - section heading with no chapter
 544               1 - chapter heading
 545               2 - book title
 546
 547         The TocItem object has convenience functions <is_chapter> and
 548         <is_section>.
 549
 550         <chapter> is twiki name of the chapter.
 551
 552         <title> is a human readable title for the chapter.  It is likely to
 553         differ from the title given in the chapter's <h1> heading.
 554         """
 555         f = urlopen(self.toc_url)
 556         self.toc = []
 557         while True:
 558             try:
 559                 self.toc.append(TocItem(f.next().strip(),
 560                                         f.next().strip(),
 561                                         f.next().strip()))
 562             except StopIteration:
 563                 break
 564         f.close()
 565         self.notify_watcher()
 566
 567     def load_book(self, tidy=True):
 568         """Fetch and parse the raw html of the book.  If tidy is true
 569         (default) links in the document will be made absolute."""
 570         f = urlopen(self.book_url)
 571         html = f.read()
 572         f.close()
 573         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 574                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 575                 '</head>\n<body>\n'
 576                 '%s\n'
 577                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 578                 'A FLOSSManuals book</div>\n</body></html>'
 579                 ) % (self.dir, self.book, html)
 580
 581         self.save_tempfile('raw.html', html)
 582
 583         tree = lxml.html.document_fromstring(html)
 584         if tidy:
 585             tree.make_links_absolute(self.book_url)
 586         self.tree = tree
 587         self.headings = [x for x in tree.cssselect('h1')]
 588         if self.headings:
 589             self.headings[0].set('class', "first-heading")
 590         #self.heading_texts = [x.textcontent() for x in self.headings]
 591         for h1 in self.headings:
 592             h1.title = h1.text_content().strip()
 593         self.notify_watcher()
 594
 595
    def load(self):
        """Wrapper around all necessary load methods: fetch the book's
        html first, then its table of contents."""
        self.load_book()
        self.load_toc()
 600
 601     def find_page(self, element, start_page=1):
 602         """Search through a page iterator and return the page
 603         number which the element probably occurs."""
 604         text = element.cookie
 605         for i, content in enumerate(self.text_pages[start_page - 1:]):
 606             log("looking for '%s' in page %s below:\n%s[...]" %
 607                 (text, i + start_page, content[:160]), debug='INDEX')
 608             #remove spaces: they can appear spuriously
 609             content = ''.join(content.split())
 610             if text in content:
 611                 return i + start_page, True
 612         #If it isn't found, return the start page so the next chapter has a chance
 613         return start_page, False
 614
 615     def make_contents(self):
 616         """Generate HTML containing the table of contents.  This can
 617         only be done after the main PDF has been made."""
 618         header = '<h1>Table of Contents</h1><table class="toc">\n'
 619         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 620                     '<td class="pagenumber">%s</td></tr>\n')
 621         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 622         footer = '\n</table>'
 623
 624         contents = []
 625
 626         chapter = 1
 627         page_num = 1
 628         subsections = [] # for the subsection heading pages.
 629
 630         headings = iter(self.headings)
 631
 632         for t in self.toc:
 633             if t.is_chapter():
 634                 try:
 635                     h1 = headings.next()
 636                 except StopIteration:
 637                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 638                     break
 639                 page_num, found = self.find_page(h1, page_num)
 640                 # sometimes the heading isn't found, which is shown as a frown
 641                 if found:
 642                     contents.append(row_tmpl % (chapter, h1.title, page_num))
 643                 else:
 644                     contents.append(row_tmpl % (chapter, h1.title, ':-('))
 645                 chapter += 1
 646             elif t.is_section():
 647                 contents.append(section_tmpl % t.title)
 648             else:
 649                 log("mystery TOC item: %s" % t)
 650
 651         doc = header + '\n'.join(contents) + footer
 652         self.notify_watcher()
 653         return doc
 654
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.

        Each section becomes a <div> listing its numbered chapters,
        placed just before the <h1> of its first chapter.  Each chapter
        <h1> gets a number at its start and a hidden cookie after it.
        """
        log(self.headings)
        headings = iter(self.headings)
        chapter = 1
        section = None

        for t in self.toc:
            # NOTE(review): a chapter listed before the first section is
            # skipped here without using up a heading, so it gets no
            # number and no cookie -- confirm that a toc always starts
            # with a section.
            if t.is_chapter() and section is not None:
                try:
                    h1 = headings.next()
                except StopIteration:
                    log("heading not found for %s (previous h1 missing?)" % t)
                    break
                # list the chapter on the current section's page
                item = h1.makeelement('div', Class='chapter')
                log(h1.title, debug='HTMLGEN')
                item.text = h1.title
                _add_initial_number(item, chapter)

                section.append(item)

                # the section page goes in front of its first chapter only
                if not section_placed:
                    log("placing section", debug='HTMLGEN')
                    h1.addprevious(section)
                    section_placed = True
                else:
                    log("NOT placing section", debug='HTMLGEN')

                #put a bold number at the beginning of the h1, and a hidden cookie at the end.
                _add_initial_number(h1, chapter)
                _add_chapter_cookie(h1)
                chapter += 1

            elif t.is_section():
                # start a new section page; it is placed in the document
                # when its first chapter turns up
                section = self.tree.makeelement('div', Class="subsection")
                # section Element complains when you try to ask it whether it
                # has been placed (though it does know)
                section_placed = False
                # the title is parsed as html, so it may contain markup
                heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
                heading.set("Class", "subsection-heading")
                section.append(heading)

        self.notify_watcher()
 702
 703
 704     def add_css(self, css=None):
 705         """If css looks like a url, use it as a stylesheet link.
 706         Otherwise it is the CSS itself, which is saved to a temporary file
 707         and linked to."""
 708         log("css is %r" % css)
 709         htmltree = self.tree
 710         if css is None or not css.strip():
 711             url = 'file://' + os.path.abspath(self.default_css)
 712         elif not re.match(r'^http://\S+$', css):
 713             fn = self.save_tempfile('objavi.css', css)
 714             url = 'file://' + fn
 715         else:
 716             url = css
 717         #XXX for debugging and perhaps sensible anyway
 718         #url = url.replace('file:///home/douglas/objavi2', '')
 719
 720
 721         #find the head -- it's probably first child but lets not assume.
 722         for child in htmltree:
 723             if child.tag == 'head':
 724                 head = child
 725                 break
 726         else:
 727             head = htmltree.makeelement('head')
 728             htmltree.insert(0, head)
 729
 730         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 731         self.css_url = url
 732         self.notify_watcher()
 733         return url
 734
 735     def set_title(self, title=None):
 736         """If a string is supplied, it becomes the book's title.
 737         Otherwise a guess is made."""
 738         if title:
 739             self.title = title
 740         else:
 741             titles = [x.text_content() for x in self.tree.cssselect('title')]
 742             if titles and titles[0]:
 743                 self.title = titles[0]
 744             else:
 745                 #oh well
 746                 self.title = 'A Manual About ' + self.book
 747         return self.title
 748
 749     def _read_localised_template(self, template, fallbacks=['en']):
 750         """Try to get the template in the approriate language, otherwise in english."""
 751         for lang in [self.lang] + fallbacks:
 752             try:
 753                 fn = template % (lang)
 754                 f = open(fn)
 755                 break
 756             except IOError, e:
 757                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 758                 log(e)
 759         template = f.read()
 760         f.close()
 761         return template
 762
 763     def compose_inside_cover(self):
 764         """create the markup for the preamble inside cover."""
 765         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 766
 767         if self.isbn:
 768             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 769         else:
 770             isbn_text = ''
 771
 772         return template % {'date': time.strftime('%Y-%m-%d'),
 773                            'isbn': isbn_text,
 774                            'license': self.license,
 775                            }
 776
 777
 778     def compose_end_matter(self):
 779         """create the markup for the end_matter inside cover.  If
 780         self.isbn is not set, the html will result in a pdf that
 781         spills onto two pages.
 782         """
 783         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 784
 785         d = {'css_url': self.css_url,
 786              'title': self.title
 787              }
 788
 789         if self.isbn:
 790             d['inside_cover_style'] = ''
 791         else:
 792             d['inside_cover_style'] = 'page-break-after: always'
 793
 794         return template % d
 795
 796
 797
 798
 799     def spawn_x(self):
 800         """Start an Xvfb instance, using a new server number.  A
 801         reference to it is stored in self.xvfb, which is used to kill
 802         it when the pdf is done.
 803
 804         Note that Xvfb doesn't interact well with dbus which is
 805         present on modern desktops.
 806         """
 807         #Find an unused server number (in case two cgis are running at once)
 808         while True:
 809             servernum = random.randrange(50, 500)
 810             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 811                 break
 812
 813         self.xserver_no = ':%s' % servernum
 814
 815         authfile = self.filepath('Xauthority')
 816         os.environ['XAUTHORITY'] = authfile
 817
 818         #mcookie(1) eats into /dev/random, so avoid that
 819         from hashlib import md5
 820         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 821         mcookie = m.hexdigest()
 822
 823         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 824
 825         self.xvfb = Popen(['Xvfb', self.xserver_no,
 826                            '-screen', '0', '1024x768x24',
 827                            '-pixdepths', '32',
 828                            #'-blackpixel', '0',
 829                            #'-whitepixel', str(2 ** 24 -1),
 830                            #'+extension', 'Composite',
 831                            '-dpi', '96',
 832                            '-kb',
 833                            '-nolisten', 'tcp',
 834                            ])
 835
 836         # We need to wait a bit before the Xvfb is ready.  but the
 837         # downloads are so slow that that probably doesn't matter
 838
 839         self.xvfb_ready_time = time.time() + 2
 840
 841         os.environ['DISPLAY'] = self.xserver_no
 842         log(self.xserver_no)
 843
 844     def wait_for_xvfb(self):
 845         """wait until a previously set time before continuing.  This
 846         is so Xvfb has time to properly start."""
 847         if hasattr(self, 'xvfb'):
 848             d = self.xvfb_ready_time - time.time()
 849             if d > 0:
 850                 time.sleep(d)
 851                 self.notify_watcher()
 852
 853     def cleanup_x(self):
 854         """Try very hard to kill off Xvfb.  In addition to killing
 855         this instance's xvfb, occasionally (randomly) search for
 856         escaped Xvfb instances and kill those too."""
 857         if not hasattr(self, 'xvfb'):
 858             return
 859         check_call(['xauth', 'remove', self.xserver_no])
 860         p = self.xvfb
 861         log("trying to kill Xvfb %s" % p.pid)
 862         os.kill(p.pid, 15)
 863         for i in range(10):
 864             if p.poll() is not None:
 865                 log("%s died with %s" % (p.pid, p.poll()))
 866                 break
 867             log("%s not dead yet" % p.pid)
 868             time.sleep(0.2)
 869         else:
 870             log("Xvfb would not die! kill -9! kill -9!")
 871             os.kill(p.pid, 9)
 872
 873         if random.random() < 0.05:
 874             #kill old xvfbs occasionally, if there are any.
 875             self.kill_old_xvfbs()
 876
 877     def kill_old_xvfbs(self):
 878         """Sometimes, despite everything, Xvfb instances hang around
 879         well after they are wanted -- for example if the cgi process
 880         dies particularly badly. So kill them if they have been
 881         running for a long time."""
 882         log("running kill_old_xvfbs")
 883         p = Popen(['ps', '-C' 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 884         data = p.communicate()[0].strip()
 885         if data:
 886             lines = data.split('\n')
 887             for line in lines:
 888                 log('dealing with ps output "%s"' % line)
 889                 try:
 890                     pid, days_, hours, minutes, seconds = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$').groups()
 891                 except AttributeError:
 892                     log("Couldn't parse that line!")
 893                 # 50 minutes should be enough xvfb time for anyone
 894                 if days or hours or int(minutes) > 50:
 895                     log("going to kill pid %s" % pid)
 896                     os.kill(int(pid), 15)
 897                     time.sleep(0.5)
 898                     os.kill(int(pid), 9)
 899         self.notify_watcher()
 900
 901     def cleanup(self):
 902         self.cleanup_x()
 903         if not config.KEEP_TEMP_FILES:
 904             for fn in os.listdir(self.workdir):
 905                 os.remove(os.path.join(self.workdir, fn))
 906             os.rmdir(self.workdir)
 907         else:
 908             log("NOT removing '%s', containing the following files:" % self.workdir)
 909             log(*os.listdir(self.workdir))
 910
 911         self.notify_watcher()
 912
 913