fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
  37 TMPDIR = os.path.abspath(config.TMPDIR)
  38 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  39 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63 def _add_chapter_cookie(e):
  64     """add magic hidden text to help with contents generation"""
  65     cookie = e.makeelement("span", Class="heading-cookie", dir="ltr",
  66                            style="font-size:6pt; line-height: 6pt; color: #fff; width:0;"
  67                            " float:left; margin:-2em; z-index: -67; display: block;"
  68                            )
  69     cookie.text = ''.join(random.choice(config.CHAPTER_COOKIE_CHARS) for x in range(8))
  70     e.cookie = cookie.text
  71     e.addnext(cookie)
  72     #e.append(cookie)
  73
  74
  75 class TocItem(object):
  76     """This makes sense of the tuples from TOC.txt files"""
  77     def __init__(self, status, chapter, title):
  78         # status is
  79         #  0 - section heading with no chapter
  80         #  1 - chapter heading
  81         #  2 - book title
  82         #
  83         # chapter is twiki name of the chapter
  84         # title is a human readable name of the chapter.
  85         self.status = status
  86         self.chapter = chapter
  87         self.title = title
  88
  89     def is_chapter(self):
  90         return self.status == '1'
  91
  92     def is_section(self):
  93         return self.status == '0'
  94
  95     def __str__(self):
  96         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  97
  98
  99 def run(cmd):
 100     try:
 101         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 102         out, err = p.communicate()
 103     except Exception:
 104         log("Failed on command: %r" % cmd)
 105         raise
 106     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
 107         (' '.join(cmd), cmd[0], p.poll(), out, err))
 108
 109
 110 def find_containing_paper(w, h):
 111     size = None
 112     for name, pw, ph in config.PAPER_SIZES:
 113         if pw >= w and ph >= h:
 114             mw = (pw - w) * 0.5
 115             mh = (ph - h) * 0.5
 116             return (name, mw, mh)
 117
 118     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 119                      (w * POINT_2_MM, h * POINT_2_MM))
 120
 121
 122
 123 class PageSettings(object):
 124     """Calculates and wraps commands for the generation and processing
 125     of PDFs"""
 126     def __init__(self, pointsize, **kwargs):
 127         # the formulas for default gutters, margins and column margins
 128         # are quite ad-hoc and certainly improvable.
 129
 130         self.width, self.height = pointsize
 131         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
 132
 133         self.gutter = kwargs.get('gutter', (config.BASE_GUTTER +
 134                                             config.PROPORTIONAL_GUTTER * self.width))
 135
 136         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
 137         self.top_margin = kwargs.get('top_margin', default_margin)
 138         self.side_margin = kwargs.get('top_margin', default_margin)
 139         self.bottom_margin = kwargs.get('top_margin', default_margin)
 140         self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
 141         self.columns = kwargs.get('columns', 1)
 142
 143         self.column_margin = kwargs.get('column_margin', default_margin * 2 / (4.0 + self.columns))
 144
 145         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
 146         self.number_margin = self.side_margin
 147
 148         # calculate margins in mm for browsers
 149         self.margins = []
 150         for m, clip in ((self.top_margin, clipy),
 151                         (self.side_margin, clipx + 0.5 * self.gutter),
 152                         (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
 153                         (self.side_margin, clipx + 0.5 * self.gutter),
 154                         ):
 155             if m is None:
 156                 m = default_margin
 157             self.margins.append((m + clip) * POINT_2_MM)
 158
 159         for x in locals().iteritems():
 160             log("%s: %s" % x, debug='PDFGEN')
 161         for x in dir(self):
 162             log("%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 163
 164
 165
 166     def _webkit_command(self, html, pdf):
 167         m = [str(x) for x in self.margins]
 168         cmd = [config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 169                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 170                ] + config.WKHTMLTOPDF_EXTRA_COMMANDS + [
 171                html, pdf]
 172         log(' '.join(cmd))
 173         return cmd
 174
 175     def _gecko_command(self, html, pdf):
 176         m = [str(x) for x in self.margins]
 177         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 178         cmd = [FIREFOX, '-P', 'pdfprint', '-print',
 179                html, '-printprinter', self.moz_printer]
 180         log(' '.join(cmd))
 181         return cmd
 182
 183     def make_raw_pdf(self, html, pdf, engine='webkit'):
 184         func = getattr(self, '_%s_command' % engine)
 185         if self.columns == 1:
 186             cmd = func(html, pdf)
 187             run(cmd)
 188         else:
 189             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 190             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 191             page_width = column_width + self.column_margin
 192
 193             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 194                                        gutter=0, top_margin=self.top_margin,
 195                                        side_margin=self.column_margin * 0.5,
 196                                        bottom_margin=self.bottom_margin)
 197
 198             column_pdf = pdf[:-4] + '-single-column.pdf'
 199             columnmaker.make_raw_pdf(html, column_pdf, engine=engine)
 200             columnmaker.reshape_pdf(column_pdf)
 201
 202             cmd = ['pdfnup',
 203                    '--nup', '%sx1' % int(self.columns),
 204                    '--paper', self.papersize.lower() + 'paper',
 205                    '--outfile', pdf,
 206                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 207                    '--noautoscale', 'true',
 208                    '--orient', 'portrait',
 209                    #'--tidy', 'false',
 210                    column_pdf
 211                    ]
 212             run(cmd)
 213
 214
 215
 216     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 217                     even_pages=True):
 218         """Spin the pdf for RTL text, resize it to the right size, and
 219         shift the gutter left and right"""
 220         ops = 'resize'
 221         if self.gutter:
 222             ops += ',shift'
 223         if even_pages:
 224             ops += ',even_pages'
 225         gutter = self.gutter
 226         if dir == 'RTL':
 227             gutter = -gutter
 228         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 229                'dir=%s' % dir,
 230                'filename=%s' % pdf,
 231                'output_filename=%s' % pdf,
 232                'operation=%s' % ops,
 233                'width=%s' % self.width,
 234                'height=%s' % self.height,
 235                'offset=%s' % gutter,
 236                'centre_start=%s' % centre_start,
 237                'centre_end=%s' % centre_end,
 238                ]
 239         run(cmd)
 240
 241     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 242                     number_start=1):
 243         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 244                'operation=page_numbers',
 245                'dir=%s' % dir,
 246                'filename=%s' % pdf,
 247                'output_filename=%s' % pdf,
 248                'number_start=%s' % number_start,
 249                'number_style=%s' % numbers,
 250                'number_bottom=%s' % self.number_bottom,
 251                'number_margin=%s' % self.number_margin,
 252                ]
 253         run(cmd)
 254
 255     def number_pdf(self, pdf, pages, **kwargs):
 256         # if there are too many pages for pdfedit to handle in one go,
 257         # split the job into bits.  <pages> may not be exact
 258         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 259             self._number_pdf(pdf, **kwargs)
 260         else:
 261             # section_size must be even
 262             sections = pages // PDFEDIT_MAX_PAGES + 1
 263             section_size = (pages // sections + 2) & ~1
 264
 265             pdf_sections = []
 266             s = kwargs.pop('number_start', 1)
 267             while s < pages:
 268                 e = s + section_size - 1
 269                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 270                 if e < pages - 1:
 271                     page_range = '%s-%s' % (s, e)
 272                 else:
 273                     page_range = '%s-end' % s
 274                 run(['pdftk',
 275                      pdf,
 276                      'cat',
 277                      page_range,
 278                      'output',
 279                      pdf_section,
 280                      ])
 281                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 282                 pdf_sections.append(pdf_section)
 283                 s = e + 1
 284
 285             concat_pdfs(pdf, *pdf_sections)
 286
 287     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 288         """Put ann ISBN barcode in a corner of a single blank page."""
 289
 290         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 291         cmd1 = [config.BOOKLAND,
 292                 '--position', position,
 293                 str(isbn)]
 294         cmd2 = ['ps2pdf',
 295                 '-dFIXEDMEDIA',
 296                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 297                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 298                 '-', pdf]
 299
 300         p1 = Popen(cmd1, stdout=PIPE)
 301         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 302         out, err = p2.communicate()
 303
 304         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 305         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 306
 307
 308
 309
 310 def concat_pdfs(name, *args):
 311     """Join all the named pdfs together into one and save it as <name>"""
 312     cmd = ['pdftk']
 313     cmd.extend(args)
 314     cmd += ['cat', 'output', name]
 315     run(cmd)
 316
 317 def index_pdf(pdf, text=None):
 318     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 319     separate pages."""
 320     if text is None:
 321         text = pdf + '.index.txt'
 322     cmd = ['pdftotext',
 323            #'-layout', #keeps more original formatting
 324            pdf,
 325            text]
 326     run(cmd)
 327     return text
 328
 329 def rotate_pdf(pdfin, pdfout):
 330     """Turn the PDF on its head"""
 331     cmd = ['pdftk', pdfin,
 332            'cat',
 333            '1-endD',
 334            'output',
 335            pdfout
 336            ]
 337     run(cmd)
 338
 339
 340 class Book(object):
 341     page_numbers = 'latin'
 342     preamble_page_numbers = 'roman'
 343     engine= 'webkit'
 344     _try_cleanup_on_del = True
 345
 346     def notify_watcher(self, message=None):
 347         if self.watcher:
 348             if  message is None:
 349                 #message is the name of the caller
 350                 #XXX look at using inspect module
 351                 import traceback
 352                 message = traceback.extract_stack(None, 2)[0][2]
 353                 log("notify_watcher called by '%s'" % message)
 354             self.watcher(message)
 355
 356     def __enter__(self):
 357         return self
 358
 359     def __exit__(self, exc_type, exc_value, traceback):
 360         self.cleanup()
 361         #could deal with exceptions here and return true
 362
 363     def __init__(self, book, server, bookname,
 364                  page_settings=None, engine=None, watcher=None):
 365         log("*** Starting new book %s ***" % bookname)
 366         self.book = book
 367         self.server = server
 368         self.watcher = watcher
 369         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 370         os.chmod(self.workdir, 0755)
 371         defaults = SERVER_DEFAULTS.get(server, SERVER_DEFAULTS[DEFAULT_SERVER])
 372         self.default_css = defaults['css']
 373         self.lang = defaults['lang']
 374         self.dir  = defaults['dir']
 375
 376         self.body_html_file = self.filepath('body.html')
 377         self.body_pdf_file = self.filepath('body.pdf')
 378         self.body_index_file = self.filepath('body.txt')
 379         self.preamble_html_file = self.filepath('preamble.html')
 380         self.preamble_pdf_file = self.filepath('preamble.pdf')
 381         self.pdf_file = self.filepath('final.pdf')
 382
 383         self.publish_name = bookname
 384         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 385         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 386
 387         self.book_url = config.BOOK_URL % (self.server, self.book)
 388         self.toc_url = config.TOC_URL % (self.server, self.book)
 389
 390         self.set_page_dimensions(page_settings)
 391
 392         if engine is not None:
 393             self.engine = engine
 394         self.notify_watcher()
 395
 396     def __del__(self):
 397         if os.path.exists(self.workdir) and self._try_cleanup_on_del:
 398             self._try_cleanup_on_del = False #or else you can get in bad cycles
 399             self.cleanup()
 400
 401     def __getattr__(self, attr):
 402         """catch unloaded books and load them"""
 403         #log('looking for missing attribute "%s"' % (attr))
 404         if attr == 'tree':
 405             self.load_book()
 406             return self.tree
 407         if attr == 'toc':
 408             self.load_toc()
 409             return self.toc
 410         raise AttributeError("no such member: '%s'" % attr)
 411
 412
 413     def filepath(self, fn):
 414         return os.path.join(self.workdir, fn)
 415
 416     def save_data(self, fn, data):
 417         """Save without tripping up on unicode"""
 418         if isinstance(data, unicode):
 419             data = data.encode('utf8', 'ignore')
 420         f = open(fn, 'w')
 421         f.write(data)
 422         f.close()
 423
 424     def save_tempfile(self, fn, data):
 425         """Save the data in a temporary directory that will be cleaned
 426         up when all is done.  Return the absolute file path."""
 427         fn = self.filepath(fn)
 428         self.save_data(fn, data)
 429         return fn
 430
 431     def set_page_dimensions(self, dimensions):
 432         self.maker = PageSettings(**dimensions)
 433
 434
 435     def extract_pdf_text(self):
 436         """Extract the text from the body pdf, split into pages, so
 437         that the correct page can be found to generate the table of
 438         contents."""
 439         index_pdf(self.body_pdf_file, self.body_index_file)
 440         f = open(self.body_index_file)
 441         s = unicode(f.read(), 'utf8')
 442         f.close()
 443         #pages are spearated by formfeed character "^L", "\f" or chr(12)
 444         self.text_pages = s.split("\f")
 445         #there is sometimes (probably always) an unwanted ^L at the end
 446         return len(self.text_pages)
 447
 448     def make_body_pdf(self):
 449         """Make a pdf of the HTML, using webkit"""
 450         #1. Save the html
 451         html_text = lxml.etree.tostring(self.tree, method="html")
 452         self.save_data(self.body_html_file, html_text)
 453
 454         #2. Make a pdf of it
 455         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
 456                                 engine=self.engine)
 457         self.notify_watcher('generate_pdf')
 458
 459         #3. extract the text for finding contents.
 460         n_pages = self.extract_pdf_text()
 461         log ("found %s pages in pdf" % n_pages)
 462         #4. resize pages, shift gutters, and rotate 180 degrees for RTL
 463         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 464         self.notify_watcher('reshape_pdf')
 465
 466         #5 add page numbers
 467         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 468                               numbers=self.page_numbers)
 469         self.notify_watcher("number_pdf")
 470         self.notify_watcher()
 471
 472     def make_preamble_pdf(self):
 473         contents = self.make_contents()
 474         html = ('<html dir="%s"><head>\n'
 475                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 476                 '<link rel="stylesheet" href="%s" />\n'
 477                 '</head>\n<body>\n'
 478                 '<h1 class="frontpage">%s</h1>'
 479                 '%s\n'
 480                 '<div class="contents">%s</div>\n'
 481                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 482                 '<!--%s--></div></body></html>'
 483                 ) % (self.dir, self.css_url, self.title, self.inside_cover_html,
 484                      contents, self.title)
 485         self.save_data(self.preamble_html_file, html)
 486
 487         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
 488                                 engine=self.engine)
 489
 490         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 491
 492         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 493                             numbers=self.preamble_page_numbers,
 494                             number_start=-2)
 495
 496         self.notify_watcher()
 497
 498     def make_pdf(self):
 499         """A convenient wrapper of a few necessary steps"""
 500         # now the Xvfb server is needed. make sure it has had long enough to get going
 501         self.wait_for_xvfb()
 502         self.make_body_pdf()
 503         self.make_preamble_pdf()
 504         concat_pdfs(self.pdf_file, self.preamble_pdf_file, self.body_pdf_file)
 505         self.notify_watcher('concatenated_pdfs')
 506         #and move it into place (what place?)
 507
 508     def rotate180(self):
 509         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 510         presses."""
 511         rotated = self.filepath('final-rotate.pdf')
 512         unrotated = self.filepath('final-pre-rotate.pdf')
 513         #leave the unrotated pdf intact at first, in case of error.
 514         rotate_pdf(self.pdf_file, rotated)
 515         os.rename(self.pdf_file, unrotated)
 516         os.rename(rotated, self.pdf_file)
 517         self.notify_watcher()
 518
 519     def publish_pdf(self):
 520         """Move the finished PDF to its final resting place"""
 521         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 522         os.rename(self.pdf_file, self.publish_file)
 523         self.notify_watcher()
 524
 525     def load_toc(self):
 526         """From the TOC.txt file create a list of TocItems with
 527         the attributes <status>, <chapter>, and <title>.
 528
 529         <status> is a number, with the following meaning:
 530
 531               0 - section heading with no chapter
 532               1 - chapter heading
 533               2 - book title
 534
 535         The TocItem object has convenience functions <is_chapter> and
 536         <is_section>.
 537
 538         <chapter> is twiki name of the chapter.
 539
 540         <title> is a human readable title for the chapter.  It is likely to
 541         differ from the title given in the chapter's <h1> heading.
 542         """
 543         f = urlopen(self.toc_url)
 544         self.toc = []
 545         while True:
 546             try:
 547                 self.toc.append(TocItem(f.next().strip(),
 548                                         f.next().strip(),
 549                                         f.next().strip()))
 550             except StopIteration:
 551                 break
 552         f.close()
 553         self.notify_watcher()
 554
 555     def load_book(self, tidy=True):
 556         """Fetch and parse the raw html of the book.  If tidy is true
 557         (default) links in the document will be made absolute."""
 558         f = urlopen(self.book_url)
 559         html = f.read()
 560         f.close()
 561         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 562                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 563                 '</head>\n<body>\n'
 564                 '%s\n'
 565                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 566                 'A FLOSSManuals book</div>\n</body></html>'
 567                 ) % (self.dir, self.book, html)
 568
 569         self.save_tempfile('raw.html', html)
 570
 571         tree = lxml.html.document_fromstring(html)
 572         if tidy:
 573             tree.make_links_absolute(self.book_url)
 574         self.tree = tree
 575         self.headings = [x for x in tree.cssselect('h1')]
 576         if self.headings:
 577             self.headings[0].set('class', "first-heading")
 578         #self.heading_texts = [x.textcontent() for x in self.headings]
 579         for h1 in self.headings:
 580             h1.title = h1.text_content().strip()
 581         self.notify_watcher()
 582
 583
 584     def load(self):
 585         """Wrapper around all necessary load methods."""
 586         self.load_book()
 587         self.load_toc()
 588
 589     def find_page(self, element, start_page=1):
 590         """Search through a page iterator and return the page
 591         number which the element probably occurs."""
 592         text = element.cookie
 593         for i, content in enumerate(self.text_pages[start_page - 1:]):
 594             log("looking for '%s' in page %s below:\n%s[...]" %
 595                 (text, i + start_page, content[:160]), debug='INDEX')
 596             #remove spaces: they can appear spuriously
 597             content = ''.join(content.split())
 598             if text in content:
 599                 return i + start_page, True
 600         #If it isn't found, return the start page so the next chapter has a chance
 601         return start_page, False
 602
 603     def make_contents(self):
 604         """Generate HTML containing the table of contents.  This can
 605         only be done after the main PDF has been made."""
 606         header = '<h1>Table of Contents</h1><table class="toc">\n'
 607         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 608                     '<td class="pagenumber">%s</td></tr>\n')
 609         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 610         footer = '\n</table>'
 611
 612         contents = []
 613
 614         chapter = 1
 615         page_num = 1
 616         subsections = [] # for the subsection heading pages.
 617
 618         headings = iter(self.headings)
 619
 620         for t in self.toc:
 621             if t.is_chapter():
 622                 try:
 623                     h1 = headings.next()
 624                 except StopIteration:
 625                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 626                     break
 627                 page_num, found = self.find_page(h1, page_num)
 628                 # sometimes the heading isn't found, which is shown as a frown
 629                 if found:
 630                     contents.append(row_tmpl % (chapter, h1.title, page_num))
 631                 else:
 632                     contents.append(row_tmpl % (chapter, h1.title, ':-('))
 633                 chapter += 1
 634             elif t.is_section():
 635                 contents.append(section_tmpl % t.title)
 636             else:
 637                 log("mystery TOC item: %s" % t)
 638
 639         doc = header + '\n'.join(contents) + footer
 640         self.notify_watcher()
 641         return doc
 642
 643     def add_section_titles(self):
 644         """Add any section heading pages that the TOC.txt file
 645         specifies.  These are sub-book, super-chapter groupings.
 646
 647         Also add initial numbers to chapters.
 648         """
 649         log(self.headings)
 650         headings = iter(self.headings)
 651         chapter = 1
 652         section = None
 653
 654         for t in self.toc:
 655             if t.is_chapter() and section is not None:
 656                 try:
 657                     h1 = headings.next()
 658                 except StopIteration:
 659                     log("heading not found for %s (previous h1 missing?)" % t)
 660                     break
 661                 item = h1.makeelement('div', Class='chapter')
 662                 log(h1.title, debug='HTMLGEN')
 663                 item.text = h1.title
 664                 _add_initial_number(item, chapter)
 665
 666                 section.append(item)
 667
 668                 if not section_placed:
 669                     log("placing section", debug='HTMLGEN')
 670                     h1.addprevious(section)
 671                     section_placed = True
 672                 else:
 673                     log("NOT placing section", debug='HTMLGEN')
 674
 675                 #put a bold number at the beginning of the h1, and a hidden cookie at the end.
 676                 _add_initial_number(h1, chapter)
 677                 _add_chapter_cookie(h1)
 678                 chapter += 1
 679
 680             elif t.is_section():
 681                 section = self.tree.makeelement('div', Class="subsection")
 682                 # section Element complains when you try to ask it whether it
 683                 # has been placed (though it does know)
 684                 section_placed = False
 685                 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
 686                 heading.set("Class", "subsection-heading")
 687                 section.append(heading)
 688
 689         self.notify_watcher()
 690
 691
 692     def add_css(self, css=None):
 693         """If css looks like a url, use it as a stylesheet link.
 694         Otherwise it is the CSS itself, which is saved to a temporary file
 695         and linked to."""
 696         log("css is %r" % css)
 697         htmltree = self.tree
 698         if css is None or not css.strip():
 699             url = 'file://' + os.path.abspath(self.default_css)
 700         elif not re.match(r'^http://\S+$', css):
 701             fn = self.save_tempfile('objavi.css', css)
 702             url = 'file://' + fn
 703         else:
 704             url = css
 705         #XXX for debugging and perhaps sensible anyway
 706         #url = url.replace('file:///home/douglas/objavi2', '')
 707
 708
 709         #find the head -- it's probably first child but lets not assume.
 710         for child in htmltree:
 711             if child.tag == 'head':
 712                 head = child
 713                 break
 714         else:
 715             head = htmltree.makeelement('head')
 716             htmltree.insert(0, head)
 717
 718         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 719         self.css_url = url
 720         self.notify_watcher()
 721         return url
 722
 723     def set_title(self, title=None):
 724         """If a string is supplied, it becomes the book's title.
 725         Otherwise a guess is made."""
 726         if title:
 727             self.title = title
 728         else:
 729             titles = [x.text_content() for x in self.tree.cssselect('title')]
 730             if titles and titles[0]:
 731                 self.title = titles[0]
 732             else:
 733                 #oh well
 734                 self.title = 'A Manual About ' + self.book
 735         return self.title
 736
 737     def compose_inside_cover(self, license=config.DEFAULT_LICENSE, isbn=None):
 738         """create the markup for the preamble inside cover, storing it
 739         in self.inside_cover_html."""
 740         #XXX this should go in make_preamble_pdf, but that needs to be extracted from make_pdf
 741
 742         if isbn:
 743             isbn_text = '<b>ISBN :</b> %s <br>' % isbn
 744             #XXX make a barcode
 745         else:
 746             isbn_text = ''
 747
 748         for lang in (self.lang, 'en'):
 749             try:
 750                 fn = INSIDE_FRONT_COVER_TEMPLATE % (lang)
 751                 f = open(fn)
 752             except IOError, e:
 753                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 754                 log(e)
 755
 756         template = f.read()
 757         f.close()
 758
 759         self.inside_cover_html = template % {'date': time.strftime('%Y-%m-%d'),
 760                                              'isbn': isbn_text,
 761                                              'license': license,
 762                                              }
 763
 764
 765     def spawn_x(self):
 766         """Start an Xvfb instance, using a new server number.  A
 767         reference to it is stored in self.xvfb, which is used to kill
 768         it when the pdf is done.
 769
 770         Note that Xvfb doesn't interact well with dbus which is
 771         present on modern desktops.
 772         """
 773         #Find an unused server number (in case two cgis are running at once)
 774         while True:
 775             servernum = random.randrange(50, 500)
 776             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 777                 break
 778
 779         self.xserver_no = ':%s' % servernum
 780
 781         authfile = self.filepath('Xauthority')
 782         os.environ['XAUTHORITY'] = authfile
 783
 784         #mcookie(1) eats into /dev/random, so avoid that
 785         from hashlib import md5
 786         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 787         mcookie = m.hexdigest()
 788
 789         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 790
 791         self.xvfb = Popen(['Xvfb', self.xserver_no,
 792                            '-screen', '0', '1024x768x24',
 793                            '-pixdepths', '32',
 794                            #'-blackpixel', '0',
 795                            #'-whitepixel', str(2 ** 24 -1),
 796                            #'+extension', 'Composite',
 797                            '-dpi', '96',
 798                            '-kb',
 799                            '-nolisten', 'tcp',
 800                            ])
 801
 802         # We need to wait a bit before the Xvfb is ready.  but the
 803         # downloads are so slow that that probably doesn't matter
 804
 805         self.xvfb_ready_time = time.time() + 2
 806
 807         os.environ['DISPLAY'] = self.xserver_no
 808         log(self.xserver_no)
 809
 810     def wait_for_xvfb(self):
 811         """wait until a previously set time before continuing.  This
 812         is so Xvfb has time to properly start."""
 813         if hasattr(self, 'xvfb'):
 814             d = self.xvfb_ready_time - time.time()
 815             if d > 0:
 816                 time.sleep(d)
 817                 self.notify_watcher()
 818
 819     def cleanup_x(self):
 820         """Try very hard to kill off Xvfb.  In addition to killing
 821         this instance's xvfb, occasionally (randomly) search for
 822         escaped Xvfb instances and kill those too."""
 823         if not hasattr(self, 'xvfb'):
 824             return
 825         check_call(['xauth', 'remove', self.xserver_no])
 826         p = self.xvfb
 827         log("trying to kill Xvfb %s" % p.pid)
 828         os.kill(p.pid, 15)
 829         for i in range(10):
 830             if p.poll() is not None:
 831                 log("%s died with %s" % (p.pid, p.poll()))
 832                 break
 833             log("%s not dead yet" % p.pid)
 834             time.sleep(0.2)
 835         else:
 836             log("Xvfb would not die! kill -9! kill -9!")
 837             os.kill(p.pid, 9)
 838
 839         if random.random() < 0.05:
 840             #kill old xvfbs occasionally, if there are any.
 841             self.kill_old_xvfbs()
 842
 843     def kill_old_xvfbs(self):
 844         """Sometimes, despite everything, Xvfb instances hang around
 845         well after they are wanted -- for example if the cgi process
 846         dies particularly badly. So kill them if they have been
 847         running for a long time."""
 848         log("running kill_old_xvfbs")
 849         p = Popen(['ps', '-C' 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 850         data = p.communicate()[0].strip()
 851         if data:
 852             lines = data.split('\n')
 853             for line in lines:
 854                 log('dealing with ps output "%s"' % line)
 855                 try:
 856                     pid, days_, hours, minutes, seconds = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$').groups()
 857                 except AttributeError:
 858                     log("Couldn't parse that line!")
 859                 # 50 minutes should be enough xvfb time for anyone
 860                 if days or hours or int(minutes) > 50:
 861                     log("going to kill pid %s" % pid)
 862                     os.kill(int(pid), 15)
 863                     time.sleep(0.5)
 864                     os.kill(int(pid), 9)
 865         self.notify_watcher()
 866
 867     def cleanup(self):
 868         self.cleanup_x()
 869         if not config.KEEP_TEMP_FILES:
 870             for fn in os.listdir(self.workdir):
 871                 os.remove(os.path.join(self.workdir, fn))
 872             os.rmdir(self.workdir)
 873         else:
 874             log("NOT removing '%s', containing the following files:" % self.workdir)
 875             log(*os.listdir(self.workdir))
 876
 877         self.notify_watcher()
 878
 879