fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
  37 TMPDIR = os.path.abspath(config.TMPDIR)
  38 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  39 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63 def _add_chapter_cookie(e):
  64     """add magic hidden text to help with contents generation"""
  65     cookie = e.makeelement("span", Class="heading-cookie", dir="ltr",
  66                            style="font-size:6pt; line-height: 6pt; color: #fff; width:0;"
  67                            " float:left; margin:-2em; z-index: -67; display: block;"
  68                            )
  69     cookie.text = ''.join(random.choice(config.CHAPTER_COOKIE_CHARS) for x in range(8))
  70     e.cookie = cookie.text
  71     e.addnext(cookie)
  72     #e.append(cookie)
  73
  74
  75 class TocItem(object):
  76     """This makes sense of the tuples from TOC.txt files"""
  77     def __init__(self, status, chapter, title):
  78         # status is
  79         #  0 - section heading with no chapter
  80         #  1 - chapter heading
  81         #  2 - book title
  82         #
  83         # chapter is twiki name of the chapter
  84         # title is a human readable name of the chapter.
  85         self.status = status
  86         self.chapter = chapter
  87         self.title = title
  88
  89     def is_chapter(self):
  90         return self.status == '1'
  91
  92     def is_section(self):
  93         return self.status == '0'
  94
  95     def __str__(self):
  96         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  97
  98
  99 def run(cmd):
 100     try:
 101         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 102         out, err = p.communicate()
 103     except Exception:
 104         log("Failed on command: %r" % cmd)
 105         raise
 106     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
 107         (' '.join(cmd), cmd[0], p.poll(), out, err))
 108
 109
 110 def find_containing_paper(w, h):
 111     size = None
 112     for name, pw, ph in config.PAPER_SIZES:
 113         if pw >= w and ph >= h:
 114             mw = (pw - w) * 0.5
 115             mh = (ph - h) * 0.5
 116             return (name, mw, mh)
 117
 118     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 119                      (w * POINT_2_MM, h * POINT_2_MM))
 120
 121
 122
 123 class PageSettings(object):
 124     """Calculates and wraps commands for the generation and processing
 125     of PDFs"""
 126     def __init__(self, pointsize, **kwargs):
 127         # the formulas for default gutters, margins and column margins
 128         # are quite ad-hoc and certainly improvable.
 129
 130         self.width, self.height = pointsize
 131         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
 132
 133         self.gutter = kwargs.get('gutter', (config.BASE_GUTTER +
 134                                             config.PROPORTIONAL_GUTTER * self.width))
 135
 136         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
 137         self.top_margin = kwargs.get('top_margin', default_margin)
 138         self.side_margin = kwargs.get('top_margin', default_margin)
 139         self.bottom_margin = kwargs.get('top_margin', default_margin)
 140         self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
 141         self.columns = kwargs.get('columns', 1)
 142
 143         self.column_margin = kwargs.get('column_margin', default_margin * 2 / (4.0 + self.columns))
 144
 145         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
 146         self.number_margin = self.side_margin
 147
 148         # calculate margins in mm for browsers
 149         self.margins = []
 150         for m, clip in ((self.top_margin, clipy),
 151                         (self.side_margin, clipx + 0.5 * self.gutter),
 152                         (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
 153                         (self.side_margin, clipx + 0.5 * self.gutter),
 154                         ):
 155             if m is None:
 156                 m = default_margin
 157             self.margins.append((m + clip) * POINT_2_MM)
 158
 159         for x in locals().iteritems():
 160             log("%s: %s" % x, debug='PDFGEN')
 161         for x in dir(self):
 162             log("%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 163
 164
 165
 166     def _webkit_command(self, html, pdf):
 167         m = [str(x) for x in self.margins]
 168         cmd = [config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 169                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 170                ] + config.WKHTMLTOPDF_EXTRA_COMMANDS + [
 171                html, pdf]
 172         log(' '.join(cmd))
 173         return cmd
 174
 175     def _gecko_command(self, html, pdf):
 176         m = [str(x) for x in self.margins]
 177         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 178         cmd = [FIREFOX, '-P', 'pdfprint', '-print',
 179                html, '-printprinter', self.moz_printer]
 180         log(' '.join(cmd))
 181         return cmd
 182
 183     def make_raw_pdf(self, html, pdf, engine='webkit'):
 184         func = getattr(self, '_%s_command' % engine)
 185         if self.columns == 1:
 186             cmd = func(html, pdf)
 187             run(cmd)
 188         else:
 189             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 190             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 191             page_width = column_width + self.column_margin
 192
 193             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 194                                        gutter=0, top_margin=self.top_margin,
 195                                        side_margin=self.column_margin * 0.5,
 196                                        bottom_margin=self.bottom_margin)
 197
 198             column_pdf = pdf[:-4] + '-single-column.pdf'
 199             columnmaker.make_raw_pdf(html, column_pdf, engine=engine)
 200             columnmaker.reshape_pdf(column_pdf)
 201
 202             cmd = ['pdfnup',
 203                    '--nup', '%sx1' % int(self.columns),
 204                    '--paper', self.papersize.lower() + 'paper',
 205                    '--outfile', pdf,
 206                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 207                    '--noautoscale', 'true',
 208                    '--orient', 'portrait',
 209                    #'--tidy', 'false',
 210                    column_pdf
 211                    ]
 212             run(cmd)
 213
 214
 215
 216     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 217                     even_pages=True):
 218         """Spin the pdf for RTL text, resize it to the right size, and
 219         shift the gutter left and right"""
 220         ops = 'resize'
 221         if self.gutter:
 222             ops += ',shift'
 223         if even_pages:
 224             ops += ',even_pages'
 225         gutter = self.gutter
 226         if dir == 'RTL':
 227             gutter = -gutter
 228         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 229                'dir=%s' % dir,
 230                'filename=%s' % pdf,
 231                'output_filename=%s' % pdf,
 232                'operation=%s' % ops,
 233                'width=%s' % self.width,
 234                'height=%s' % self.height,
 235                'offset=%s' % gutter,
 236                'centre_start=%s' % centre_start,
 237                'centre_end=%s' % centre_end,
 238                ]
 239         run(cmd)
 240
 241     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 242                     number_start=1):
 243         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 244                'operation=page_numbers',
 245                'dir=%s' % dir,
 246                'filename=%s' % pdf,
 247                'output_filename=%s' % pdf,
 248                'number_start=%s' % number_start,
 249                'number_style=%s' % numbers,
 250                'number_bottom=%s' % self.number_bottom,
 251                'number_margin=%s' % self.number_margin,
 252                ]
 253         run(cmd)
 254
 255     def number_pdf(self, pdf, pages, **kwargs):
 256         # if there are too many pages for pdfedit to handle in one go,
 257         # split the job into bits.  <pages> may not be exact
 258         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 259             self._number_pdf(pdf, **kwargs)
 260         else:
 261             # section_size must be even
 262             sections = pages // PDFEDIT_MAX_PAGES + 1
 263             section_size = (pages // sections + 2) & ~1
 264
 265             pdf_sections = []
 266             s = kwargs.pop('number_start', 1)
 267             while s < pages:
 268                 e = s + section_size - 1
 269                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 270                 if e < pages - 1:
 271                     page_range = '%s-%s' % (s, e)
 272                 else:
 273                     page_range = '%s-end' % s
 274                 run(['pdftk',
 275                      pdf,
 276                      'cat',
 277                      page_range,
 278                      'output',
 279                      pdf_section,
 280                      ])
 281                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 282                 pdf_sections.append(pdf_section)
 283                 s = e + 1
 284
 285             concat_pdfs(pdf, *pdf_sections)
 286
 287     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 288         """Put ann ISBN barcode in a corner of a single blank page."""
 289
 290         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 291         cmd1 = [config.BOOKLAND,
 292                 '--position', position,
 293                 str(isbn)]
 294         cmd2 = ['ps2pdf',
 295                 '-dFIXEDMEDIA',
 296                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 297                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 298                 '-', pdf]
 299
 300         p1 = Popen(cmd1, stdout=PIPE)
 301         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 302         out, err = p2.communicate()
 303
 304         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 305         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 306
 307
 308
 309
 310 def concat_pdfs(name, *args):
 311     """Join all the named pdfs together into one and save it as <name>"""
 312     cmd = ['pdftk']
 313     cmd.extend(x for x in args if x is not None)
 314     cmd += ['cat', 'output', name]
 315     run(cmd)
 316
 317 def index_pdf(pdf, text=None):
 318     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 319     separate pages."""
 320     if text is None:
 321         text = pdf + '.index.txt'
 322     cmd = ['pdftotext',
 323            #'-layout', #keeps more original formatting
 324            pdf,
 325            text]
 326     run(cmd)
 327     return text
 328
 329 def rotate_pdf(pdfin, pdfout):
 330     """Turn the PDF on its head"""
 331     cmd = ['pdftk', pdfin,
 332            'cat',
 333            '1-endD',
 334            'output',
 335            pdfout
 336            ]
 337     run(cmd)
 338
 339
class Book(object):
    """A complete FM book being turned into a PDF."""
    # numbering style given to number_pdf for the body pages
    page_numbers = 'latin'
    # numbering style given to number_pdf for the preamble pages
    preamble_page_numbers = 'roman'
    # default rendering engine; __init__ may override it per instance
    engine= 'webkit'
    # cleared by __del__ before it calls cleanup, to avoid bad cycles
    _try_cleanup_on_del = True
 345
 346     def notify_watcher(self, message=None):
 347         if self.watcher:
 348             if  message is None:
 349                 #message is the name of the caller
 350                 #XXX look at using inspect module
 351                 import traceback
 352                 message = traceback.extract_stack(None, 2)[0][2]
 353             log("notify_watcher called with '%s'" % message)
 354             self.watcher(message)
 355
 356     def __enter__(self):
 357         return self
 358
 359     def __exit__(self, exc_type, exc_value, traceback):
 360         self.cleanup()
 361         #could deal with exceptions here and return true
 362
 363     def __init__(self, book, server, bookname,
 364                  page_settings=None, engine=None, watcher=None, isbn=None,
 365                  license=config.DEFAULT_LICENSE):
 366         log("*** Starting new book %s ***" % bookname)
 367         self.book = book
 368         self.server = server
 369         self.watcher = watcher
 370         self.isbn = isbn
 371         self.license = license
 372         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 373         os.chmod(self.workdir, 0755)
 374         defaults = SERVER_DEFAULTS.get(server, SERVER_DEFAULTS[DEFAULT_SERVER])
 375         self.default_css = defaults['css']
 376         self.lang = defaults['lang']
 377         self.dir  = defaults['dir']
 378
 379         self.body_html_file = self.filepath('body.html')
 380         self.body_pdf_file = self.filepath('body.pdf')
 381         self.body_index_file = self.filepath('body.txt')
 382         self.preamble_html_file = self.filepath('preamble.html')
 383         self.preamble_pdf_file = self.filepath('preamble.pdf')
 384         self.tail_html_file = self.filepath('tail.html')
 385         self.tail_pdf_file = self.filepath('tail.pdf')
 386         self.isbn_pdf_file = None
 387         self.pdf_file = self.filepath('final.pdf')
 388
 389         self.publish_name = bookname
 390         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 391         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 392
 393         self.book_url = config.BOOK_URL % (self.server, self.book)
 394         self.toc_url = config.TOC_URL % (self.server, self.book)
 395
 396         self.set_page_dimensions(page_settings)
 397
 398         if engine is not None:
 399             self.engine = engine
 400         self.notify_watcher()
 401
 402     def __del__(self):
 403         if os.path.exists(self.workdir) and self._try_cleanup_on_del:
 404             self._try_cleanup_on_del = False #or else you can get in bad cycles
 405             self.cleanup()
 406
 407     def __getattr__(self, attr):
 408         """catch unloaded books and load them"""
 409         #log('looking for missing attribute "%s"' % (attr))
 410         if attr == 'tree':
 411             self.load_book()
 412             return self.tree
 413         if attr == 'toc':
 414             self.load_toc()
 415             return self.toc
 416         raise AttributeError("no such member: '%s'" % attr)
 417
 418
 419     def filepath(self, fn):
 420         return os.path.join(self.workdir, fn)
 421
 422     def save_data(self, fn, data):
 423         """Save without tripping up on unicode"""
 424         if isinstance(data, unicode):
 425             data = data.encode('utf8', 'ignore')
 426         f = open(fn, 'w')
 427         f.write(data)
 428         f.close()
 429
 430     def save_tempfile(self, fn, data):
 431         """Save the data in a temporary directory that will be cleaned
 432         up when all is done.  Return the absolute file path."""
 433         fn = self.filepath(fn)
 434         self.save_data(fn, data)
 435         return fn
 436
 437     def set_page_dimensions(self, dimensions):
 438         self.maker = PageSettings(**dimensions)
 439
 440
 441     def extract_pdf_text(self):
 442         """Extract the text from the body pdf, split into pages, so
 443         that the correct page can be found to generate the table of
 444         contents."""
 445         index_pdf(self.body_pdf_file, self.body_index_file)
 446         f = open(self.body_index_file)
 447         s = unicode(f.read(), 'utf8')
 448         f.close()
 449         #pages are spearated by formfeed character "^L", "\f" or chr(12)
 450         self.text_pages = s.split("\f")
 451         #there is sometimes (probably always) an unwanted ^L at the end
 452         return len(self.text_pages)
 453
 454     def make_body_pdf(self):
 455         """Make a pdf of the HTML, using webkit"""
 456         #1. Save the html
 457         html_text = lxml.etree.tostring(self.tree, method="html")
 458         self.save_data(self.body_html_file, html_text)
 459
 460         #2. Make a pdf of it
 461         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
 462                                 engine=self.engine)
 463         self.notify_watcher('generate_pdf')
 464
 465         #3. extract the text for finding contents.
 466         n_pages = self.extract_pdf_text()
 467         log ("found %s pages in pdf" % n_pages)
 468         #4. resize pages, shift gutters, and rotate 180 degrees for RTL
 469         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 470         self.notify_watcher('reshape_pdf')
 471
 472         #5 add page numbers
 473         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 474                               numbers=self.page_numbers)
 475         self.notify_watcher("number_pdf")
 476         self.notify_watcher()
 477
 478     def make_preamble_pdf(self):
 479         contents = self.make_contents()
 480         inside_cover_html = self.compose_inside_cover()
 481         html = ('<html dir="%s"><head>\n'
 482                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 483                 '<link rel="stylesheet" href="%s" />\n'
 484                 '</head>\n<body>\n'
 485                 '<h1 class="frontpage">%s</h1>'
 486                 '%s\n'
 487                 '<div class="contents">%s</div>\n'
 488                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 489                 '<!--%s--></div></body></html>'
 490                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 491                      contents, self.title)
 492         self.save_data(self.preamble_html_file, html)
 493
 494         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
 495                                 engine=self.engine)
 496
 497         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 498
 499         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 500                             numbers=self.preamble_page_numbers,
 501                             number_start=-2)
 502
 503         self.notify_watcher()
 504
 505     def make_end_matter_pdf(self):
 506         """Make an inside back cover and a back cover.  If there is an
 507         isbn number its barcode will be put on the back cover."""
 508         if self.isbn:
 509             self.isbn_pdf_file = self.filepath('isbn.pdf')
 510             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 511             self.notify_watcher('make_barcode_pdf')
 512
 513         self.save_data(self.tail_html_file, self.compose_end_matter())
 514         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file,
 515                                 engine=self.engine)
 516
 517         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 518                                centre_end=True, even_pages=False)
 519         self.notify_watcher()
 520
 521     def make_book_pdf(self):
 522         """A convenient wrapper of a few necessary steps"""
 523         # now the Xvfb server is needed. make sure it has had long enough to get going
 524         self.wait_for_xvfb()
 525         self.make_body_pdf()
 526         self.make_preamble_pdf()
 527         self.make_end_matter_pdf()
 528
 529         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 530                     self.body_pdf_file, self.tail_pdf_file,
 531                     self.isbn_pdf_file)
 532
 533         self.notify_watcher('concatenated_pdfs')
 534         #and move it into place (what place?)
 535
 536     def rotate180(self):
 537         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 538         presses."""
 539         rotated = self.filepath('final-rotate.pdf')
 540         unrotated = self.filepath('final-pre-rotate.pdf')
 541         #leave the unrotated pdf intact at first, in case of error.
 542         rotate_pdf(self.pdf_file, rotated)
 543         os.rename(self.pdf_file, unrotated)
 544         os.rename(rotated, self.pdf_file)
 545         self.notify_watcher()
 546
 547     def publish_pdf(self):
 548         """Move the finished PDF to its final resting place"""
 549         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 550         os.rename(self.pdf_file, self.publish_file)
 551         self.notify_watcher()
 552
 553     def load_toc(self):
 554         """From the TOC.txt file create a list of TocItems with
 555         the attributes <status>, <chapter>, and <title>.
 556
 557         <status> is a number, with the following meaning:
 558
 559               0 - section heading with no chapter
 560               1 - chapter heading
 561               2 - book title
 562
 563         The TocItem object has convenience functions <is_chapter> and
 564         <is_section>.
 565
 566         <chapter> is twiki name of the chapter.
 567
 568         <title> is a human readable title for the chapter.  It is likely to
 569         differ from the title given in the chapter's <h1> heading.
 570         """
 571         f = urlopen(self.toc_url)
 572         self.toc = []
 573         while True:
 574             try:
 575                 self.toc.append(TocItem(f.next().strip(),
 576                                         f.next().strip(),
 577                                         f.next().strip()))
 578             except StopIteration:
 579                 break
 580         f.close()
 581         self.notify_watcher()
 582
 583     def load_book(self, tidy=True):
 584         """Fetch and parse the raw html of the book.  If tidy is true
 585         (default) links in the document will be made absolute."""
 586         f = urlopen(self.book_url)
 587         html = f.read()
 588         f.close()
 589         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 590                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 591                 '</head>\n<body>\n'
 592                 '%s\n'
 593                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 594                 'A FLOSSManuals book</div>\n</body></html>'
 595                 ) % (self.dir, self.book, html)
 596
 597         self.save_tempfile('raw.html', html)
 598
 599         tree = lxml.html.document_fromstring(html)
 600         if tidy:
 601             tree.make_links_absolute(self.book_url)
 602         self.tree = tree
 603         self.headings = [x for x in tree.cssselect('h1')]
 604         if self.headings:
 605             self.headings[0].set('class', "first-heading")
 606         #self.heading_texts = [x.textcontent() for x in self.headings]
 607         for h1 in self.headings:
 608             h1.title = h1.text_content().strip()
 609         self.notify_watcher()
 610
 611
 612     def load(self):
 613         """Wrapper around all necessary load methods."""
 614         self.load_book()
 615         self.load_toc()
 616
 617     def find_page(self, element, start_page=1):
 618         """Search through a page iterator and return the page
 619         number which the element probably occurs."""
 620         text = element.cookie
 621         for i, content in enumerate(self.text_pages[start_page - 1:]):
 622             log("looking for '%s' in page %s below:\n%s[...]" %
 623                 (text, i + start_page, content[:160]), debug='INDEX')
 624             #remove spaces: they can appear spuriously
 625             content = ''.join(content.split())
 626             if text in content:
 627                 return i + start_page, True
 628         #If it isn't found, return the start page so the next chapter has a chance
 629         return start_page, False
 630
 631     def make_contents(self):
 632         """Generate HTML containing the table of contents.  This can
 633         only be done after the main PDF has been made."""
 634         header = '<h1>Table of Contents</h1><table class="toc">\n'
 635         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 636                     '<td class="pagenumber">%s</td></tr>\n')
 637         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 638         footer = '\n</table>'
 639
 640         contents = []
 641
 642         chapter = 1
 643         page_num = 1
 644         subsections = [] # for the subsection heading pages.
 645
 646         headings = iter(self.headings)
 647
 648         for t in self.toc:
 649             if t.is_chapter():
 650                 try:
 651                     h1 = headings.next()
 652                 except StopIteration:
 653                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 654                     break
 655                 page_num, found = self.find_page(h1, page_num)
 656                 # sometimes the heading isn't found, which is shown as a frown
 657                 if found:
 658                     contents.append(row_tmpl % (chapter, h1.title, page_num))
 659                 else:
 660                     contents.append(row_tmpl % (chapter, h1.title, ':-('))
 661                 chapter += 1
 662             elif t.is_section():
 663                 contents.append(section_tmpl % t.title)
 664             else:
 665                 log("mystery TOC item: %s" % t)
 666
 667         doc = header + '\n'.join(contents) + footer
 668         self.notify_watcher()
 669         return doc
 670
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.

        Each section becomes a div holding its heading and one numbered
        entry per chapter.  The div is inserted before the h1 of the
        section's first chapter.  Each chapter's h1 gets a number at
        its start and a hidden cookie after it.
        """
        log(self.headings)
        headings = iter(self.headings)
        chapter = 1
        section = None

        for t in self.toc:
            # NOTE(review): a chapter listed before the first section is
            # skipped here without taking a heading from <headings>, so
            # it gets no number or cookie and later chapters would pair
            # with earlier h1s -- confirm that TOCs always open with a
            # section.
            if t.is_chapter() and section is not None:
                try:
                    h1 = headings.next()
                except StopIteration:
                    log("heading not found for %s (previous h1 missing?)" % t)
                    break
                # an entry for this chapter in the section's listing
                item = h1.makeelement('div', Class='chapter')
                log(h1.title, debug='HTMLGEN')
                item.text = h1.title
                _add_initial_number(item, chapter)

                section.append(item)

                # the section div goes in front of its first chapter only
                if not section_placed:
                    log("placing section", debug='HTMLGEN')
                    h1.addprevious(section)
                    section_placed = True
                else:
                    log("NOT placing section", debug='HTMLGEN')

                #put a bold number at the beginning of the h1, and a hidden cookie right after it.
                _add_initial_number(h1, chapter)
                _add_chapter_cookie(h1)
                chapter += 1

            elif t.is_section():
                section = self.tree.makeelement('div', Class="subsection")
                # section Element complains when you try to ask it whether it
                # has been placed (though it does know)
                section_placed = False
                # the title is parsed as an html fragment, wrapped in a div
                heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
                heading.set("Class", "subsection-heading")
                section.append(heading)

        self.notify_watcher()
 718
 719
 720     def add_css(self, css=None, mode='book'):
 721         """If css looks like a url, use it as a stylesheet link.
 722         Otherwise it is the CSS itself, which is saved to a temporary file
 723         and linked to."""
 724         log("css is %r" % css)
 725         htmltree = self.tree
 726         if css is None or not css.strip():
 727             defaults = SERVER_DEFAULTS[self.server]
 728             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 729         elif not re.match(r'^http://\S+$', css):
 730             fn = self.save_tempfile('objavi.css', css)
 731             url = 'file://' + fn
 732         else:
 733             url = css
 734         #XXX for debugging and perhaps sensible anyway
 735         #url = url.replace('file:///home/douglas/objavi2', '')
 736
 737
 738         #find the head -- it's probably first child but lets not assume.
 739         for child in htmltree:
 740             if child.tag == 'head':
 741                 head = child
 742                 break
 743         else:
 744             head = htmltree.makeelement('head')
 745             htmltree.insert(0, head)
 746
 747         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 748         self.css_url = url
 749         self.notify_watcher()
 750         return url
 751
 752     def set_title(self, title=None):
 753         """If a string is supplied, it becomes the book's title.
 754         Otherwise a guess is made."""
 755         if title:
 756             self.title = title
 757         else:
 758             titles = [x.text_content() for x in self.tree.cssselect('title')]
 759             if titles and titles[0]:
 760                 self.title = titles[0]
 761             else:
 762                 #oh well
 763                 self.title = 'A Manual About ' + self.book
 764         return self.title
 765
 766     def _read_localised_template(self, template, fallbacks=['en']):
 767         """Try to get the template in the approriate language, otherwise in english."""
 768         for lang in [self.lang] + fallbacks:
 769             try:
 770                 fn = template % (lang)
 771                 f = open(fn)
 772                 break
 773             except IOError, e:
 774                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 775                 log(e)
 776         template = f.read()
 777         f.close()
 778         return template
 779
 780     def compose_inside_cover(self):
 781         """create the markup for the preamble inside cover."""
 782         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 783
 784         if self.isbn:
 785             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 786         else:
 787             isbn_text = ''
 788
 789         return template % {'date': time.strftime('%Y-%m-%d'),
 790                            'isbn': isbn_text,
 791                            'license': self.license,
 792                            }
 793
 794
 795     def compose_end_matter(self):
 796         """create the markup for the end_matter inside cover.  If
 797         self.isbn is not set, the html will result in a pdf that
 798         spills onto two pages.
 799         """
 800         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 801
 802         d = {'css_url': self.css_url,
 803              'title': self.title
 804              }
 805
 806         if self.isbn:
 807             d['inside_cover_style'] = ''
 808         else:
 809             d['inside_cover_style'] = 'page-break-after: always'
 810
 811         return template % d
 812
 813
 814
 815
 816     def spawn_x(self):
 817         """Start an Xvfb instance, using a new server number.  A
 818         reference to it is stored in self.xvfb, which is used to kill
 819         it when the pdf is done.
 820
 821         Note that Xvfb doesn't interact well with dbus which is
 822         present on modern desktops.
 823         """
 824         #Find an unused server number (in case two cgis are running at once)
 825         while True:
 826             servernum = random.randrange(50, 500)
 827             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 828                 break
 829
 830         self.xserver_no = ':%s' % servernum
 831
 832         authfile = self.filepath('Xauthority')
 833         os.environ['XAUTHORITY'] = authfile
 834
 835         #mcookie(1) eats into /dev/random, so avoid that
 836         from hashlib import md5
 837         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 838         mcookie = m.hexdigest()
 839
 840         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 841
 842         self.xvfb = Popen(['Xvfb', self.xserver_no,
 843                            '-screen', '0', '1024x768x24',
 844                            '-pixdepths', '32',
 845                            #'-blackpixel', '0',
 846                            #'-whitepixel', str(2 ** 24 -1),
 847                            #'+extension', 'Composite',
 848                            '-dpi', '96',
 849                            '-kb',
 850                            '-nolisten', 'tcp',
 851                            ])
 852
 853         # We need to wait a bit before the Xvfb is ready.  but the
 854         # downloads are so slow that that probably doesn't matter
 855
 856         self.xvfb_ready_time = time.time() + 2
 857
 858         os.environ['DISPLAY'] = self.xserver_no
 859         log(self.xserver_no)
 860
 861     def wait_for_xvfb(self):
 862         """wait until a previously set time before continuing.  This
 863         is so Xvfb has time to properly start."""
 864         if hasattr(self, 'xvfb'):
 865             d = self.xvfb_ready_time - time.time()
 866             if d > 0:
 867                 time.sleep(d)
 868                 self.notify_watcher()
 869
 870     def cleanup_x(self):
 871         """Try very hard to kill off Xvfb.  In addition to killing
 872         this instance's xvfb, occasionally (randomly) search for
 873         escaped Xvfb instances and kill those too."""
 874         if not hasattr(self, 'xvfb'):
 875             return
 876         check_call(['xauth', 'remove', self.xserver_no])
 877         p = self.xvfb
 878         log("trying to kill Xvfb %s" % p.pid)
 879         os.kill(p.pid, 15)
 880         for i in range(10):
 881             if p.poll() is not None:
 882                 log("%s died with %s" % (p.pid, p.poll()))
 883                 break
 884             log("%s not dead yet" % p.pid)
 885             time.sleep(0.2)
 886         else:
 887             log("Xvfb would not die! kill -9! kill -9!")
 888             os.kill(p.pid, 9)
 889
 890         if random.random() < 0.05:
 891             #kill old xvfbs occasionally, if there are any.
 892             self.kill_old_xvfbs()
 893
 894     def kill_old_xvfbs(self):
 895         """Sometimes, despite everything, Xvfb instances hang around
 896         well after they are wanted -- for example if the cgi process
 897         dies particularly badly. So kill them if they have been
 898         running for a long time."""
 899         log("running kill_old_xvfbs")
 900         p = Popen(['ps', '-C' 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 901         data = p.communicate()[0].strip()
 902         if data:
 903             lines = data.split('\n')
 904             for line in lines:
 905                 log('dealing with ps output "%s"' % line)
 906                 try:
 907                     pid, days_, hours, minutes, seconds = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$').groups()
 908                 except AttributeError:
 909                     log("Couldn't parse that line!")
 910                 # 50 minutes should be enough xvfb time for anyone
 911                 if days or hours or int(minutes) > 50:
 912                     log("going to kill pid %s" % pid)
 913                     os.kill(int(pid), 15)
 914                     time.sleep(0.5)
 915                     os.kill(int(pid), 9)
 916         self.notify_watcher()
 917
 918     def cleanup(self):
 919         self.cleanup_x()
 920         if not config.KEEP_TEMP_FILES:
 921             for fn in os.listdir(self.workdir):
 922                 os.remove(os.path.join(self.workdir, fn))
 923             os.rmdir(self.workdir)
 924         else:
 925             log("NOT removing '%s', containing the following files:" % self.workdir)
 926             log(*os.listdir(self.workdir))
 927
 928         self.notify_watcher()
 929
 930