fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
# Absolute form of the configured scratch directory; each Book makes
# its own working directory inside it.
TMPDIR = os.path.abspath(config.TMPDIR)
# Web server document root, falling back to the current directory when
# DOCUMENT_ROOT is not in the environment.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
# Directory (under the document root) that finished books are moved to.
PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63
  64 class TocItem(object):
  65     """This makes sense of the tuples from TOC.txt files"""
  66     def __init__(self, status, chapter, title):
  67         # status is
  68         #  0 - section heading with no chapter
  69         #  1 - chapter heading
  70         #  2 - book title
  71         #
  72         # chapter is twiki name of the chapter
  73         # title is a human readable name of the chapter.
  74         self.status = status
  75         self.chapter = chapter
  76         self.title = title
  77
  78     def is_chapter(self):
  79         return self.status == '1'
  80
  81     def is_section(self):
  82         return self.status == '0'
  83
  84     def __str__(self):
  85         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  86
  87
  88 def run(cmd):
  89     try:
  90         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
  91         out, err = p.communicate()
  92     except Exception:
  93         log("Failed on command: %r" % cmd)
  94         raise
  95     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
  96         (' '.join(cmd), cmd[0], p.poll(), out, err))
  97
  98
  99 def find_containing_paper(w, h):
 100     size = None
 101     for name, pw, ph in config.PAPER_SIZES:
 102         if pw >= w and ph >= h:
 103             mw = (pw - w) * 0.5
 104             mh = (ph - h) * 0.5
 105             return (name, mw, mh)
 106
 107     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 108                      (w * POINT_2_MM, h * POINT_2_MM))
 109
 110
 111
 112 class PageSettings(object):
 113     """Calculates and wraps commands for the generation and processing
 114     of PDFs"""
 115     def __init__(self, pointsize, **kwargs):
 116         # the formulas for default gutters, margins and column margins
 117         # are quite ad-hoc and certainly improvable.
 118         self.width, self.height = pointsize
 119         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
 120         self.grey_scale = 'grey_scale' in kwargs
 121
 122         self.engine = kwargs.get('engine', config.DEFAULT_ENGINE)
 123         # All measurements in points unless otherwise stated
 124         # user interaction is in *mm*, but is converted in objavi2.py
 125         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
 126         default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)
 127
 128         self.top_margin = kwargs.get('top_margin', default_margin)
 129         self.side_margin = kwargs.get('side_margin', default_margin)
 130         self.bottom_margin = kwargs.get('bottom_margin', default_margin)
 131         self.gutter = kwargs.get('gutter', default_gutter)
 132
 133         self.columns = kwargs.get('columns', 1)
 134         if self.columns == 'auto': #default for newspapers is to work out columns
 135             self.columns = int(self.width // config.MIN_COLUMN_WIDTH)
 136
 137         self.column_margin = kwargs.get('column_margin',
 138                                         default_margin * 2 / (5.0 + self.columns))
 139
 140         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
 141         self.number_margin = self.side_margin
 142
 143         # calculate margins in mm for browsers
 144         self.margins = []
 145         for m, clip in ((self.top_margin, clipy),
 146                         (self.side_margin, clipx + 0.5 * self.gutter),
 147                         (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
 148                         (self.side_margin, clipx + 0.5 * self.gutter),
 149                         ):
 150             self.margins.append((m + clip) * POINT_2_MM)
 151
 152         self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
 153
 154         if 'PDFGEN' in config.DEBUG_MODES:
 155             log("making PageSettings with:")
 156             for x in locals().iteritems():
 157                 log("%s: %s" % x, debug='PDFGEN')
 158             for x in dir(self):
 159                 if not x.startswith('__'):
 160                     log("self.%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 161
 162
 163
 164     def _webkit_command(self, html, pdf, outline=False):
 165         m = [str(x) for x in self.margins]
 166         outline_args = ['--outline'] * outline
 167         greyscale_args = ['-g'] * self.grey_scale
 168         cmd = ([config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 169                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 170                '-d', '100'] + outline_args + greyscale_args +
 171                config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf])
 172         log(' '.join(cmd))
 173         return cmd
 174
 175     def _gecko_command(self, html, pdf, outline=False):
 176         m = [str(x) for x in self.margins]
 177         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 178         cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
 179                html, '-printprinter', self.moz_printer]
 180         log(' '.join(cmd))
 181         return cmd
 182
 183     def make_raw_pdf(self, html, pdf, outline=False):
 184         func = getattr(self, '_%s_command' % self.engine)
 185         if self.columns == 1:
 186             cmd = func(html, pdf, outline=outline)
 187             run(cmd)
 188         else:
 189             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 190             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 191             page_width = column_width + self.column_margin
 192             side_margin = self.column_margin * 0.5
 193             if 'PDFGEN' in config.DEBUG_MODES:
 194                 log("making columns with:")
 195                 for k, v in locals().iteritems():
 196                     log("%s: %r" % (k, v))
 197                 for k in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
 198                     log("self.%s: %r" % (k, getattr(self, k)))
 199
 200             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 201                                        gutter=0, top_margin=self.top_margin,
 202                                        side_margin=side_margin,
 203                                        bottom_margin=self.bottom_margin,
 204                                        grey_scale=self.grey_scale,
 205                                        engine=self.engine
 206                                        )
 207
 208             column_pdf = pdf[:-4] + '-single-column.pdf'
 209             columnmaker.make_raw_pdf(html, column_pdf, outline=outline)
 210             columnmaker.reshape_pdf(column_pdf)
 211             cmd = ['pdfnup',
 212                    '--nup', '%sx1' % int(self.columns),
 213                    '--paper', self.papersize.lower() + 'paper',
 214                    '--outfile', pdf,
 215                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 216                    '--noautoscale', 'true',
 217                    '--orient', 'portrait',
 218                    #'--tidy', 'false',
 219                    column_pdf
 220                    ]
 221
 222             run(cmd)
 223
 224
 225
 226     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 227                     even_pages=True):
 228         """Spin the pdf for RTL text, resize it to the right size, and
 229         shift the gutter left and right"""
 230         ops = 'resize'
 231         if self.gutter:
 232             ops += ',shift'
 233         if even_pages:
 234             ops += ',even_pages'
 235         gutter = self.gutter
 236         if dir == 'RTL':
 237             gutter = -gutter
 238         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 239                'dir=%s' % dir,
 240                'filename=%s' % pdf,
 241                'output_filename=%s' % pdf,
 242                'operation=%s' % ops,
 243                'width=%s' % self.width,
 244                'height=%s' % self.height,
 245                'offset=%s' % gutter,
 246                'centre_start=%s' % centre_start,
 247                'centre_end=%s' % centre_end,
 248                ]
 249         run(cmd)
 250
 251     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 252                     number_start=1):
 253         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 254                'operation=page_numbers',
 255                'dir=%s' % dir,
 256                'filename=%s' % pdf,
 257                'output_filename=%s' % pdf,
 258                'number_start=%s' % number_start,
 259                'number_style=%s' % numbers,
 260                'number_bottom=%s' % self.number_bottom,
 261                'number_margin=%s' % self.number_margin,
 262                ]
 263         run(cmd)
 264
 265     def number_pdf(self, pdf, pages, **kwargs):
 266         # if there are too many pages for pdfedit to handle in one go,
 267         # split the job into bits.  <pages> may not be exact
 268         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 269             self._number_pdf(pdf, **kwargs)
 270         else:
 271             # section_size must be even
 272             sections = pages // PDFEDIT_MAX_PAGES + 1
 273             section_size = (pages // sections + 2) & ~1
 274
 275             pdf_sections = []
 276             s = kwargs.pop('number_start', 1)
 277             while s < pages:
 278                 e = s + section_size - 1
 279                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 280                 if e < pages - 1:
 281                     page_range = '%s-%s' % (s, e)
 282                 else:
 283                     page_range = '%s-end' % s
 284                 run(['pdftk',
 285                      pdf,
 286                      'cat',
 287                      page_range,
 288                      'output',
 289                      pdf_section,
 290                      ])
 291                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 292                 pdf_sections.append(pdf_section)
 293                 s = e + 1
 294
 295             concat_pdfs(pdf, *pdf_sections)
 296
 297     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 298         """Put an ISBN barcode in a corner of a single blank page."""
 299
 300         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 301         cmd1 = [config.BOOKLAND,
 302                 '--position', position,
 303                 str(isbn)]
 304         cmd2 = ['ps2pdf',
 305                 '-dFIXEDMEDIA',
 306                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 307                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 308                 '-', pdf]
 309
 310         p1 = Popen(cmd1, stdout=PIPE)
 311         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 312         out, err = p2.communicate()
 313
 314         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 315         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 316
 317
 318 def count_pdf_pages(pdf):
 319     """How many pages in the PDF?"""
 320     #XXX could also use python-pypdf or python-poppler
 321     cmd = ('pdfinfo', pdf)
 322     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 323     out, err = p.communicate()
 324     m = re.search(r'^\s*Pages:\s*(\d+)\s*$', out, re.MULTILINE)
 325     return int(m.group(1))
 326
 327
 328 def concat_pdfs(destination, *pdfs):
 329     """Join all the named pdfs together into one and save it as <name>"""
 330     cmd = ['pdftk']
 331     cmd.extend(x for x in pdfs if x is not None)
 332     cmd += ['cat', 'output', destination]
 333     run(cmd)
 334
 335 def index_pdf(pdf, text=None):
 336     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 337     separate pages."""
 338     if text is None:
 339         text = pdf + '.index.txt'
 340     cmd = ['pdftotext',
 341            #'-layout', #keeps more original formatting
 342            pdf,
 343            text]
 344     run(cmd)
 345     return text
 346
 347 def rotate_pdf(pdfin, pdfout):
 348     """Turn the PDF on its head"""
 349     cmd = ['pdftk', pdfin,
 350            'cat',
 351            '1-endD',
 352            'output',
 353            pdfout
 354            ]
 355     run(cmd)
 356
 357 def parse_outline(pdf, level_threshold):
 358     """Create a structure reflecting the outline of a PDF.
 359     A chapter heading looks like this:
 360
 361     BookmarkTitle: 2. What is sound?
 362     BookmarkLevel: 1
 363     BookmarkPageNumber: 3
 364     """
 365     cmd = ('pdftk', pdf, 'dump_data')
 366     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 367     outline, err = p.communicate()
 368     lines = (x.strip() for x in outline.split('\n') if x.strip())
 369     contents = []
 370
 371     def extract(expected, conv=str.strip):
 372         line = lines.next()
 373         try:
 374             k, v = line.split(':', 1)
 375             if k == expected:
 376                 return conv(v)
 377         except ValueError:
 378             log("trouble with line %r" %line)
 379
 380     #There are a few useless variables, then the pagecount, then the contents.
 381     #The pagecount is useful, so pick it up first.
 382     page_count = None
 383     while page_count == None:
 384         page_count = extract('NumberOfPages', int)
 385
 386     try:
 387         while True:
 388             title = extract('BookmarkTitle')
 389             if title is not None:
 390                 level = extract('BookmarkLevel', int)
 391                 pagenum = extract('BookmarkPageNumber', int)
 392                 if level <= level_threshold and None not in (level, pagenum):
 393                     contents.append((title, level, pagenum))
 394     except StopIteration:
 395         pass
 396
 397     return contents, outline, page_count
 398
 399
class Book(object):
    # Numbering style passed as <numbers> when the body pages are numbered.
    page_numbers = 'latin'
    # Numbering style passed as <numbers> when the preamble pages are numbered.
    preamble_page_numbers = 'roman'
 403
 404     def notify_watcher(self, message=None):
 405         if self.watcher:
 406             if  message is None:
 407                 #message is the name of the caller
 408                 #XXX look at using inspect module
 409                 import traceback
 410                 message = traceback.extract_stack(None, 2)[0][2]
 411             log("notify_watcher called with '%s'" % message)
 412             self.watcher(message)
 413
    def __enter__(self):
        """Enter a `with` block; the book itself is the managed object."""
        return self
 416
    def __exit__(self, exc_type, exc_value, traceback):
        """Leave a `with` block: run cleanup().  Nothing is returned,
        so an exception raised inside the block still propagates."""
        self.cleanup()
        #could deal with exceptions here and return true
 420
 421     def __init__(self, book, server, bookname,
 422                  page_settings=None, watcher=None, isbn=None,
 423                  license=config.DEFAULT_LICENSE):
 424         log("*** Starting new book %s ***" % bookname)
 425         self.book = book
 426         self.server = server
 427         self.watcher = watcher
 428         self.isbn = isbn
 429         self.license = license
 430         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 431         os.chmod(self.workdir, 0755)
 432         defaults = SERVER_DEFAULTS[server]
 433         self.lang = defaults['lang']
 434         self.dir  = defaults['dir']
 435
 436         self.body_html_file = self.filepath('body.html')
 437         self.body_pdf_file = self.filepath('body.pdf')
 438         self.body_index_file = self.filepath('body.txt')
 439         self.preamble_html_file = self.filepath('preamble.html')
 440         self.preamble_pdf_file = self.filepath('preamble.pdf')
 441         self.tail_html_file = self.filepath('tail.html')
 442         self.tail_pdf_file = self.filepath('tail.pdf')
 443         self.isbn_pdf_file = None
 444         self.pdf_file = self.filepath('final.pdf')
 445         self.body_odt_file = self.filepath('body.odt')
 446
 447         self.publish_name = bookname
 448         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 449         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 450
 451         self.book_url = config.BOOK_URL % (self.server, self.book)
 452         self.toc_url = config.TOC_URL % (self.server, self.book)
 453         if page_settings is not None:
 454             self.maker = PageSettings(**page_settings)
 455
 456         self.notify_watcher()
 457
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        # Class-level flag; __del__ shadows it with a false instance
        # attribute before cleaning up, so cleanup is tried only once.
        _try_cleanup_on_del = True
        def __del__(self):
            """Run cleanup() when the object is deleted, if the working
            directory still exists and cleanup has not been tried yet."""
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
 465
 466     def __getattr__(self, attr):
 467         """catch unloaded books and load them"""
 468         #log('looking for missing attribute "%s"' % (attr))
 469         if attr == 'tree':
 470             self.load_book()
 471             return self.tree
 472         if attr == 'toc':
 473             self.load_toc()
 474             return self.toc
 475         raise AttributeError("no such member: '%s'" % attr)
 476
 477
    def filepath(self, fn):
        """Return the path of the file named <fn> inside the book's
        working directory."""
        return os.path.join(self.workdir, fn)
 480
 481     def save_data(self, fn, data):
 482         """Save without tripping up on unicode"""
 483         if isinstance(data, unicode):
 484             data = data.encode('utf8', 'ignore')
 485         f = open(fn, 'w')
 486         f.write(data)
 487         f.close()
 488
 489     def save_tempfile(self, fn, data):
 490         """Save the data in a temporary directory that will be cleaned
 491         up when all is done.  Return the absolute file path."""
 492         fn = self.filepath(fn)
 493         self.save_data(fn, data)
 494         return fn
 495
 496     def make_oo_doc(self):
 497         """Make an openoffice document, using the html2odt script."""
 498         self.wait_for_xvfb()
 499         html_text = lxml.etree.tostring(self.tree, method="html")
 500         self.save_data(self.body_html_file, html_text)
 501         run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
 502         log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
 503         os.rename(self.body_odt_file, self.publish_file)
 504         self.notify_watcher()
 505
 506     def extract_pdf_outline(self):
 507         self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
 508         for x in self.outline_contents:
 509             log(x)
 510         self.notify_watcher()
 511         return number_of_pages
 512
 513     def make_body_pdf(self):
 514         """Make a pdf of the HTML, using webkit"""
 515         #1. Save the html
 516         html_text = lxml.etree.tostring(self.tree, method="html")
 517         self.save_data(self.body_html_file, html_text)
 518
 519         #2. Make a pdf of it
 520         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
 521         self.notify_watcher('generate_pdf')
 522
 523         n_pages = self.extract_pdf_outline()
 524
 525         log ("found %s pages in pdf" % n_pages)
 526         #4. resize pages, shift gutters, even pages
 527         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 528         self.notify_watcher('reshape_pdf')
 529
 530         #5 add page numbers
 531         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 532                               numbers=self.page_numbers)
 533         self.notify_watcher("number_pdf")
 534         self.notify_watcher()
 535
 536     def make_preamble_pdf(self):
 537         contents = self.make_contents()
 538         inside_cover_html = self.compose_inside_cover()
 539         html = ('<html dir="%s"><head>\n'
 540                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 541                 '<link rel="stylesheet" href="%s" />\n'
 542                 '</head>\n<body>\n'
 543                 '<h1 class="frontpage">%s</h1>'
 544                 '%s\n'
 545                 '<div class="contents">%s</div>\n'
 546                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 547                 '<!--%s--></div></body></html>'
 548                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 549                      contents, self.title)
 550         self.save_data(self.preamble_html_file, html)
 551
 552         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
 553
 554
 555         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 556
 557         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 558                             numbers=self.preamble_page_numbers,
 559                             number_start=-2)
 560
 561         self.notify_watcher()
 562
 563     def make_end_matter_pdf(self):
 564         """Make an inside back cover and a back cover.  If there is an
 565         isbn number its barcode will be put on the back cover."""
 566         if self.isbn:
 567             self.isbn_pdf_file = self.filepath('isbn.pdf')
 568             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 569             self.notify_watcher('make_barcode_pdf')
 570
 571         self.save_data(self.tail_html_file, self.compose_end_matter())
 572         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
 573
 574         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 575                                centre_end=True, even_pages=False)
 576         self.notify_watcher()
 577
 578     def make_book_pdf(self):
 579         """A convenient wrapper of a few necessary steps"""
 580         # now the Xvfb server is needed. make sure it has had long enough to get going
 581         self.wait_for_xvfb()
 582         self.make_body_pdf()
 583         self.make_preamble_pdf()
 584         self.make_end_matter_pdf()
 585
 586         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 587                     self.body_pdf_file, self.tail_pdf_file,
 588                     self.isbn_pdf_file)
 589
 590         self.notify_watcher('concatenated_pdfs')
 591
 592
 593     def make_simple_pdf(self, mode):
 594         """Make a simple pdf document without contents or separate
 595         title page.  This is used for multicolumn newspapers and for
 596         web-destined pdfs."""
 597         self.wait_for_xvfb()
 598         #0. Add heading to begining of html
 599         body = list(self.tree.cssselect('body'))[0]
 600         e = body.makeelement('h1', {'id': 'book-title'})
 601         e.text = self.title
 602         body.insert(0, e)
 603         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 604         e.addnext(intro)
 605
 606         #0.5 adjust parameters to suit the particular kind of output
 607         if mode == 'web':
 608             self.maker.gutter = 0
 609
 610         #1. Save the html
 611         html_text = lxml.etree.tostring(self.tree, method="html")
 612         self.save_data(self.body_html_file, html_text)
 613
 614         #2. Make a pdf of it (direct to to final pdf)
 615         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
 616         self.notify_watcher('generate_pdf')
 617         #n_pages = self.extract_pdf_outline()
 618         n_pages = count_pdf_pages(self.pdf_file)
 619
 620         if mode != 'web':
 621             #3. resize pages and shift gutters.
 622             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 623             self.notify_watcher('reshape_pdf')
 624
 625             #4. add page numbers
 626             self.maker.number_pdf(self.pdf_file, n_pages,
 627                                   dir=self.dir, numbers=self.page_numbers)
 628             self.notify_watcher("number_pdf")
 629         self.notify_watcher()
 630
 631
 632     def rotate180(self):
 633         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 634         presses."""
 635         rotated = self.filepath('final-rotate.pdf')
 636         unrotated = self.filepath('final-pre-rotate.pdf')
 637         #leave the unrotated pdf intact at first, in case of error.
 638         rotate_pdf(self.pdf_file, rotated)
 639         os.rename(self.pdf_file, unrotated)
 640         os.rename(rotated, self.pdf_file)
 641         self.notify_watcher()
 642
 643     def publish_pdf(self):
 644         """Move the finished PDF to its final resting place"""
 645         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 646         os.rename(self.pdf_file, self.publish_file)
 647         self.notify_watcher()
 648
 649     def load_toc(self):
 650         """From the TOC.txt file create a list of TocItems with
 651         the attributes <status>, <chapter>, and <title>.
 652
 653         <status> is a number, with the following meaning:
 654
 655               0 - section heading with no chapter
 656               1 - chapter heading
 657               2 - book title
 658
 659         The TocItem object has convenience functions <is_chapter> and
 660         <is_section>.
 661
 662         <chapter> is twiki name of the chapter.
 663
 664         <title> is a human readable title for the chapter.  It is likely to
 665         differ from the title given in the chapter's <h1> heading.
 666         """
 667         f = urlopen(self.toc_url)
 668         self.toc = []
 669         while True:
 670             try:
 671                 self.toc.append(TocItem(f.next().strip(),
 672                                         f.next().strip(),
 673                                         f.next().strip()))
 674             except StopIteration:
 675                 break
 676         f.close()
 677         self.notify_watcher()
 678
 679     def load_book(self, tidy=True):
 680         """Fetch and parse the raw html of the book.  If tidy is true
 681         (default) links in the document will be made absolute."""
 682         f = urlopen(self.book_url)
 683         html = f.read()
 684         f.close()
 685         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 686                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 687                 '</head>\n<body>\n'
 688                 '%s\n'
 689                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 690                 'A FLOSSManuals book</div>\n</body></html>'
 691                 ) % (self.dir, self.book, html)
 692
 693         self.save_tempfile('raw.html', html)
 694
 695         tree = lxml.html.document_fromstring(html)
 696         if tidy:
 697             tree.make_links_absolute(self.book_url)
 698         self.tree = tree
 699         self.headings = [x for x in tree.cssselect('h1')]
 700         if self.headings:
 701             self.headings[0].set('class', "first-heading")
 702         for h1 in self.headings:
 703             h1.title = h1.text_content().strip()
 704         self.notify_watcher()
 705
    def load(self):
        """Wrapper around all necessary load methods: fetch the book's
        html first, then its table of contents."""
        self.load_book()
        self.load_toc()
 710
 711     def make_contents(self):
 712         """Generate HTML containing the table of contents.  This can
 713         only be done after the main PDF has been made."""
 714         header = '<h1>Table of Contents</h1><table class="toc">\n'
 715         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 716                     '<td class="pagenumber">%s</td></tr>\n')
 717         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 718         footer = '\n</table>'
 719
 720         contents = []
 721
 722         chapter = 1
 723         page_num = 1
 724         subsections = [] # for the subsection heading pages.
 725
 726         outline_contents = iter(self.outline_contents)
 727         headings = iter(self.headings)
 728
 729         for t in self.toc:
 730             if t.is_chapter():
 731                 try:
 732                     h1 = headings.next()
 733                 except StopIteration:
 734                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 735                     break
 736                 h1_text, level, page_num = outline_contents.next()
 737                 log("%r %r" % (h1.title, h1_text))
 738                 contents.append(row_tmpl % (chapter, h1.title, page_num))
 739                 chapter += 1
 740             elif t.is_section():
 741                 contents.append(section_tmpl % t.title)
 742             else:
 743                 log("mystery TOC item: %s" % t)
 744
 745         doc = header + '\n'.join(contents) + footer
 746         self.notify_watcher()
 747         return doc
 748
 749     def add_section_titles(self):
 750         """Add any section heading pages that the TOC.txt file
 751         specifies.  These are sub-book, super-chapter groupings.
 752
 753         Also add initial numbers to chapters.
 754         """
 755         headings = iter(self.headings)
 756         chapter = 1
 757         section = None
 758
 759         for t in self.toc:
 760             if t.is_chapter() and section is not None:
 761                 try:
 762                     h1 = headings.next()
 763                 except StopIteration:
 764                     log("heading not found for %s (previous h1 missing?)" % t)
 765                     break
 766                 item = h1.makeelement('div', Class='chapter')
 767                 log(h1.title, debug='HTMLGEN')
 768                 item.text = h1.title
 769                 _add_initial_number(item, chapter)
 770
 771                 section.append(item)
 772
 773                 if not section_placed:
 774                     log("placing section", debug='HTMLGEN')
 775                     h1.addprevious(section)
 776                     section_placed = True
 777                 else:
 778                     log("NOT placing section", debug='HTMLGEN')
 779
 780                 #put a bold number at the beginning of the h1.
 781                 _add_initial_number(h1, chapter)
 782                 chapter += 1
 783
 784             elif t.is_section():
 785                 section = self.tree.makeelement('div', Class="subsection")
 786                 # section Element complains when you try to ask it whether it
 787                 # has been placed (though it does know)
 788                 section_placed = False
 789                 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
 790                 heading.set("Class", "subsection-heading")
 791                 section.append(heading)
 792
 793         self.notify_watcher()
 794
 795
 796     def add_css(self, css=None, mode='book'):
 797         """If css looks like a url, use it as a stylesheet link.
 798         Otherwise it is the CSS itself, which is saved to a temporary file
 799         and linked to."""
 800         log("css is %r" % css)
 801         htmltree = self.tree
 802         if css is None or not css.strip():
 803             defaults = SERVER_DEFAULTS[self.server]
 804             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 805         elif not re.match(r'^http://\S+$', css):
 806             fn = self.save_tempfile('objavi.css', css)
 807             url = 'file://' + fn
 808         else:
 809             url = css
 810         #XXX for debugging and perhaps sensible anyway
 811         #url = url.replace('file:///home/douglas/objavi2', '')
 812
 813
 814         #find the head -- it's probably first child but lets not assume.
 815         for child in htmltree:
 816             if child.tag == 'head':
 817                 head = child
 818                 break
 819         else:
 820             head = htmltree.makeelement('head')
 821             htmltree.insert(0, head)
 822
 823         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 824         self.css_url = url
 825         self.notify_watcher()
 826         return url
 827
 828     def set_title(self, title=None):
 829         """If a string is supplied, it becomes the book's title.
 830         Otherwise a guess is made."""
 831         if title:
 832             self.title = title
 833         else:
 834             titles = [x.text_content() for x in self.tree.cssselect('title')]
 835             if titles and titles[0]:
 836                 self.title = titles[0]
 837             else:
 838                 #oh well
 839                 self.title = 'A Manual About ' + self.book
 840         return self.title
 841
 842     def _read_localised_template(self, template, fallbacks=['en']):
 843         """Try to get the template in the approriate language, otherwise in english."""
 844         for lang in [self.lang] + fallbacks:
 845             try:
 846                 fn = template % (lang)
 847                 f = open(fn)
 848                 break
 849             except IOError, e:
 850                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 851                 log(e)
 852         template = f.read()
 853         f.close()
 854         return template
 855
 856     def compose_inside_cover(self):
 857         """create the markup for the preamble inside cover."""
 858         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 859
 860         if self.isbn:
 861             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 862         else:
 863             isbn_text = ''
 864
 865         return template % {'date': time.strftime('%Y-%m-%d'),
 866                            'isbn': isbn_text,
 867                            'license': self.license,
 868                            }
 869
 870
 871     def compose_end_matter(self):
 872         """create the markup for the end_matter inside cover.  If
 873         self.isbn is not set, the html will result in a pdf that
 874         spills onto two pages.
 875         """
 876         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 877
 878         d = {'css_url': self.css_url,
 879              'title': self.title
 880              }
 881
 882         if self.isbn:
 883             d['inside_cover_style'] = ''
 884         else:
 885             d['inside_cover_style'] = 'page-break-after: always'
 886
 887         return template % d
 888
 889
 890
 891
 892     def spawn_x(self):
 893         """Start an Xvfb instance, using a new server number.  A
 894         reference to it is stored in self.xvfb, which is used to kill
 895         it when the pdf is done.
 896
 897         Note that Xvfb doesn't interact well with dbus which is
 898         present on modern desktops.
 899         """
 900         #Find an unused server number (in case two cgis are running at once)
 901         while True:
 902             servernum = random.randrange(50, 500)
 903             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 904                 break
 905
 906         self.xserver_no = ':%s' % servernum
 907
 908         authfile = self.filepath('Xauthority')
 909         os.environ['XAUTHORITY'] = authfile
 910
 911         #mcookie(1) eats into /dev/random, so avoid that
 912         from hashlib import md5
 913         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 914         mcookie = m.hexdigest()
 915
 916         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 917
 918         self.xvfb = Popen(['Xvfb', self.xserver_no,
 919                            '-screen', '0', '1024x768x24',
 920                            '-pixdepths', '32',
 921                            #'-blackpixel', '0',
 922                            #'-whitepixel', str(2 ** 24 -1),
 923                            #'+extension', 'Composite',
 924                            '-dpi', '96',
 925                            '-kb',
 926                            '-nolisten', 'tcp',
 927                            ])
 928
 929         # We need to wait a bit before the Xvfb is ready.  but the
 930         # downloads are so slow that that probably doesn't matter
 931
 932         self.xvfb_ready_time = time.time() + 2
 933
 934         os.environ['DISPLAY'] = self.xserver_no
 935         log(self.xserver_no)
 936
 937     def wait_for_xvfb(self):
 938         """wait until a previously set time before continuing.  This
 939         is so Xvfb has time to properly start."""
 940         if hasattr(self, 'xvfb'):
 941             d = self.xvfb_ready_time - time.time()
 942             if d > 0:
 943                 time.sleep(d)
 944                 self.notify_watcher()
 945
 946     def cleanup_x(self):
 947         """Try very hard to kill off Xvfb.  In addition to killing
 948         this instance's xvfb, occasionally (randomly) search for
 949         escaped Xvfb instances and kill those too."""
 950         if not hasattr(self, 'xvfb'):
 951             return
 952         check_call(['xauth', 'remove', self.xserver_no])
 953         p = self.xvfb
 954         log("trying to kill Xvfb %s" % p.pid)
 955         os.kill(p.pid, 15)
 956         for i in range(10):
 957             if p.poll() is not None:
 958                 log("%s died with %s" % (p.pid, p.poll()))
 959                 break
 960             log("%s not dead yet" % p.pid)
 961             time.sleep(0.2)
 962         else:
 963             log("Xvfb would not die! kill -9! kill -9!")
 964             os.kill(p.pid, 9)
 965
 966         if random.random() < 0.1:
 967             # occasionally kill old xvfbs and soffices, if there are any.
 968             self.kill_old_processes()
 969
 970     def kill_old_processes(self):
 971         """Sometimes, despite everything, Xvfb or soffice instances
 972         hang around well after they are wanted -- for example if the
 973         cgi process dies particularly badly. So kill them if they have
 974         been running for a long time."""
 975         log("running kill_old_processes")
 976         p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
 977                    '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 978         data = p.communicate()[0].strip()
 979         if data:
 980             lines = data.split('\n')
 981             for line in lines:
 982                 log('dealing with ps output "%s"' % line)
 983                 try:
 984                     pid, days, hours, minutes, seconds \
 985                          = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
 986                 except AttributeError:
 987                     log("Couldn't parse that line!")
 988                 # 50 minutes should be enough xvfb time for anyone
 989                 if days or hours or int(minutes) > 50:
 990                     log("going to kill pid %s" % pid)
 991                     os.kill(int(pid), 15)
 992                     time.sleep(0.5)
 993                     try:
 994                         os.kill(int(pid), 9)
 995                         log('killing %s with -9')
 996                     except OSError, e:
 997                         pass
 998         self.notify_watcher()
 999
1000     def cleanup(self):
1001         self.cleanup_x()
1002         if not config.KEEP_TEMP_FILES:
1003             for fn in os.listdir(self.workdir):
1004                 os.remove(os.path.join(self.workdir, fn))
1005             os.rmdir(self.workdir)
1006         else:
1007             log("NOT removing '%s', containing the following files:" % self.workdir)
1008             log(*os.listdir(self.workdir))
1009
1010         self.notify_watcher()
1011
1012