fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
# Absolute path of the scratch directory in which each book's working
# directory is created.
TMPDIR = os.path.abspath(config.TMPDIR)
# The web server's document root, or the current directory if
# DOCUMENT_ROOT is not set in the environment.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
# Directory, under the document root, that finished books are moved to.
PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63
  64 class TocItem(object):
  65     """This makes sense of the tuples from TOC.txt files"""
  66     def __init__(self, status, chapter, title):
  67         # status is
  68         #  0 - section heading with no chapter
  69         #  1 - chapter heading
  70         #  2 - book title
  71         #
  72         # chapter is twiki name of the chapter
  73         # title is a human readable name of the chapter.
  74         self.status = status
  75         self.chapter = chapter
  76         self.title = title
  77
  78     def is_chapter(self):
  79         return self.status == '1'
  80
  81     def is_section(self):
  82         return self.status == '0'
  83
  84     def __str__(self):
  85         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  86
  87
  88 def run(cmd):
  89     try:
  90         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
  91         out, err = p.communicate()
  92     except Exception:
  93         log("Failed on command: %r" % cmd)
  94         raise
  95     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
  96         (' '.join(cmd), cmd[0], p.poll(), out, err))
  97
  98
  99 def find_containing_paper(w, h):
 100     size = None
 101     for name, pw, ph in config.PAPER_SIZES:
 102         if pw >= w and ph >= h:
 103             mw = (pw - w) * 0.5
 104             mh = (ph - h) * 0.5
 105             return (name, mw, mh)
 106
 107     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 108                      (w * POINT_2_MM, h * POINT_2_MM))
 109
 110
 111
 112 class PageSettings(object):
 113     """Calculates and wraps commands for the generation and processing
 114     of PDFs"""
    def __init__(self, pointsize, **kwargs):
        """Work out the geometry for pages of the given size.

        <pointsize> is a (width, height) pair, in points.

        These keyword arguments each replace a calculated default:
        top_margin, side_margin, bottom_margin, gutter, columns (a
        number, or 'auto' to use as many columns of
        config.MIN_COLUMN_WIDTH as fit in the width), column_margin,
        and moz_printer.  Greyscale output is switched on by the
        presence of a grey_scale keyword.

        Raises ValueError (from find_containing_paper) if the page
        fits on none of the known paper sizes.
        """
        # the formulas for default gutters, margins and column margins
        # are quite ad-hoc and certainly improvable.

        self.width, self.height = pointsize
        # clipx and clipy are half the spare paper in each direction
        self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
        # NOTE(review): only the presence of the key is tested, so
        # grey_scale=False also switches greyscale on -- confirm that
        # callers rely on this before changing it.
        self.grey_scale = 'grey_scale' in kwargs

        # All measurements in points unless otherwise stated
        # user interaction is in *mm*, but is converted in objavi2.py
        default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
        default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)

        self.top_margin = kwargs.get('top_margin', default_margin)
        self.side_margin = kwargs.get('side_margin', default_margin)
        self.bottom_margin = kwargs.get('bottom_margin', default_margin)
        self.gutter = kwargs.get('gutter', default_gutter)

        self.columns = kwargs.get('columns', 1)
        if self.columns == 'auto': #default for newspapers is to work out columns
            self.columns = int(self.width // config.MIN_COLUMN_WIDTH)

        self.column_margin = kwargs.get('column_margin',
                                        default_margin * 2 / (5.0 + self.columns))

        # where the page numbers go; passed to pdfedit by _number_pdf
        self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
        self.number_margin = self.side_margin

        # calculate margins in mm for browsers
        # The order is top, right, bottom, left (see _webkit_command).
        # Each margin also has to cover the part of the paper that
        # lies outside the page.
        self.margins = []
        for m, clip in ((self.top_margin, clipy),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        ):
            self.margins.append((m + clip) * POINT_2_MM)

        self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
        # Dump every local variable and every attribute to the debug
        # log.  Local variable names therefore show up in the log.
        for x in locals().iteritems():
            log("%s: %s" % x, debug='PDFGEN')
        for x in dir(self):
            log("%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 157
 158
 159
 160     def _webkit_command(self, html, pdf, outline=False):
 161         m = [str(x) for x in self.margins]
 162         outline_args = ['--outline'] * outline
 163         greyscale_args = ['-g'] * self.grey_scale
 164         cmd = ([config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 165                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 166                '-d', '100'] + outline_args + greyscale_args +
 167                config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf])
 168         log(' '.join(cmd))
 169         return cmd
 170
 171     def _gecko_command(self, html, pdf, outline=False):
 172         m = [str(x) for x in self.margins]
 173         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 174         cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
 175                html, '-printprinter', self.moz_printer]
 176         log(' '.join(cmd))
 177         return cmd
 178
 179     def make_raw_pdf(self, html, pdf, engine='webkit', outline=False):
 180         func = getattr(self, '_%s_command' % engine)
 181         if self.columns == 1:
 182             cmd = func(html, pdf, outline=outline)
 183             run(cmd)
 184         else:
 185             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 186             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 187             page_width = column_width + self.column_margin
 188             side_margin = self.column_margin * 0.5
 189
 190             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 191                                        gutter=0, top_margin=self.top_margin,
 192                                        side_margin=side_margin,
 193                                        bottom_margin=self.bottom_margin,
 194                                        grey_scale=self.grey_scale,
 195                                        )
 196
 197             column_pdf = pdf[:-4] + '-single-column.pdf'
 198             columnmaker.make_raw_pdf(html, column_pdf, engine=engine, outline=outline)
 199             columnmaker.reshape_pdf(column_pdf)
 200
 201             cmd = ['pdfnup',
 202                    '--nup', '%sx1' % int(self.columns),
 203                    '--paper', self.papersize.lower() + 'paper',
 204                    '--outfile', pdf,
 205                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 206                    '--noautoscale', 'true',
 207                    '--orient', 'portrait',
 208                    #'--tidy', 'false',
 209                    column_pdf
 210                    ]
 211
 212             run(cmd)
 213
 214
 215
 216     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 217                     even_pages=True):
 218         """Spin the pdf for RTL text, resize it to the right size, and
 219         shift the gutter left and right"""
 220         ops = 'resize'
 221         if self.gutter:
 222             ops += ',shift'
 223         if even_pages:
 224             ops += ',even_pages'
 225         gutter = self.gutter
 226         if dir == 'RTL':
 227             gutter = -gutter
 228         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 229                'dir=%s' % dir,
 230                'filename=%s' % pdf,
 231                'output_filename=%s' % pdf,
 232                'operation=%s' % ops,
 233                'width=%s' % self.width,
 234                'height=%s' % self.height,
 235                'offset=%s' % gutter,
 236                'centre_start=%s' % centre_start,
 237                'centre_end=%s' % centre_end,
 238                ]
 239         run(cmd)
 240
 241     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 242                     number_start=1):
 243         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 244                'operation=page_numbers',
 245                'dir=%s' % dir,
 246                'filename=%s' % pdf,
 247                'output_filename=%s' % pdf,
 248                'number_start=%s' % number_start,
 249                'number_style=%s' % numbers,
 250                'number_bottom=%s' % self.number_bottom,
 251                'number_margin=%s' % self.number_margin,
 252                ]
 253         run(cmd)
 254
 255     def number_pdf(self, pdf, pages, **kwargs):
 256         # if there are too many pages for pdfedit to handle in one go,
 257         # split the job into bits.  <pages> may not be exact
 258         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 259             self._number_pdf(pdf, **kwargs)
 260         else:
 261             # section_size must be even
 262             sections = pages // PDFEDIT_MAX_PAGES + 1
 263             section_size = (pages // sections + 2) & ~1
 264
 265             pdf_sections = []
 266             s = kwargs.pop('number_start', 1)
 267             while s < pages:
 268                 e = s + section_size - 1
 269                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 270                 if e < pages - 1:
 271                     page_range = '%s-%s' % (s, e)
 272                 else:
 273                     page_range = '%s-end' % s
 274                 run(['pdftk',
 275                      pdf,
 276                      'cat',
 277                      page_range,
 278                      'output',
 279                      pdf_section,
 280                      ])
 281                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 282                 pdf_sections.append(pdf_section)
 283                 s = e + 1
 284
 285             concat_pdfs(pdf, *pdf_sections)
 286
 287     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 288         """Put an ISBN barcode in a corner of a single blank page."""
 289
 290         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 291         cmd1 = [config.BOOKLAND,
 292                 '--position', position,
 293                 str(isbn)]
 294         cmd2 = ['ps2pdf',
 295                 '-dFIXEDMEDIA',
 296                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 297                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 298                 '-', pdf]
 299
 300         p1 = Popen(cmd1, stdout=PIPE)
 301         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 302         out, err = p2.communicate()
 303
 304         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 305         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 306
 307
 308 def count_pdf_pages(pdf):
 309     """How many pages in the PDF?"""
 310     #XXX could also use python-pypdf or python-poppler
 311     cmd = ('pdfinfo', pdf)
 312     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 313     out, err = p.communicate()
 314     m = re.search(r'^\s*Pages:\s*(\d+)\s*$', re.MULTILINE)
 315     return int(m.group(1))
 316
 317
 318 def concat_pdfs(destination, *pdfs):
 319     """Join all the named pdfs together into one and save it as <name>"""
 320     cmd = ['pdftk']
 321     cmd.extend(x for x in pdfs if x is not None)
 322     cmd += ['cat', 'output', destination]
 323     run(cmd)
 324
 325 def index_pdf(pdf, text=None):
 326     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 327     separate pages."""
 328     if text is None:
 329         text = pdf + '.index.txt'
 330     cmd = ['pdftotext',
 331            #'-layout', #keeps more original formatting
 332            pdf,
 333            text]
 334     run(cmd)
 335     return text
 336
 337 def rotate_pdf(pdfin, pdfout):
 338     """Turn the PDF on its head"""
 339     cmd = ['pdftk', pdfin,
 340            'cat',
 341            '1-endD',
 342            'output',
 343            pdfout
 344            ]
 345     run(cmd)
 346
 347 def parse_outline(pdf, level_threshold):
 348     """Create a structure reflecting the outline of a PDF.
 349     A chapter heading looks like this:
 350
 351     BookmarkTitle: 2. What is sound?
 352     BookmarkLevel: 1
 353     BookmarkPageNumber: 3
 354     """
 355     cmd = ('pdftk', pdf, 'dump_data')
 356     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 357     outline, err = p.communicate()
 358     lines = (x.strip() for x in outline.split('\n') if x.strip())
 359     contents = []
 360
 361     def extract(expected, conv=str.strip):
 362         line = lines.next()
 363         try:
 364             k, v = line.split(':', 1)
 365             if k == expected:
 366                 return conv(v)
 367         except ValueError:
 368             log("trouble with line %r" %line)
 369
 370     #There are a few useless variables, then the pagecount, then the contents.
 371     #The pagecount is useful, so pick it up first.
 372     page_count = None
 373     while page_count == None:
 374         page_count = extract('NumberOfPages', int)
 375
 376     try:
 377         while True:
 378             title = extract('BookmarkTitle')
 379             if title is not None:
 380                 level = extract('BookmarkLevel', int)
 381                 pagenum = extract('BookmarkPageNumber', int)
 382                 if level <= level_threshold and None not in (level, pagenum):
 383                     contents.append((title, level, pagenum))
 384     except StopIteration:
 385         pass
 386
 387     return contents, outline, page_count
 388
 389
 390 class Book(object):
 391     page_numbers = 'latin'
 392     preamble_page_numbers = 'roman'
 393     engine= 'webkit'
 394     _try_cleanup_on_del = config.TRY_BOOK_CLEANUP_ON_DEL
 395
 396     def notify_watcher(self, message=None):
 397         if self.watcher:
 398             if  message is None:
 399                 #message is the name of the caller
 400                 #XXX look at using inspect module
 401                 import traceback
 402                 message = traceback.extract_stack(None, 2)[0][2]
 403             log("notify_watcher called with '%s'" % message)
 404             self.watcher(message)
 405
    def __enter__(self):
        """Entering a with block yields the book itself."""
        return self
 408
    def __exit__(self, exc_type, exc_value, traceback):
        """Call self.cleanup() on leaving a with block.  Nothing is
        returned, so an exception raised in the block carries on."""
        self.cleanup()
        #could deal with exceptions here and return true
 412
 413     def __init__(self, book, server, bookname,
 414                  page_settings=None, engine=None, watcher=None, isbn=None,
 415                  license=config.DEFAULT_LICENSE):
 416         log("*** Starting new book %s ***" % bookname)
 417         self.book = book
 418         self.server = server
 419         self.watcher = watcher
 420         self.isbn = isbn
 421         self.license = license
 422         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 423         os.chmod(self.workdir, 0755)
 424         defaults = SERVER_DEFAULTS[server]
 425         self.lang = defaults['lang']
 426         self.dir  = defaults['dir']
 427
 428         self.body_html_file = self.filepath('body.html')
 429         self.body_pdf_file = self.filepath('body.pdf')
 430         self.body_index_file = self.filepath('body.txt')
 431         self.preamble_html_file = self.filepath('preamble.html')
 432         self.preamble_pdf_file = self.filepath('preamble.pdf')
 433         self.tail_html_file = self.filepath('tail.html')
 434         self.tail_pdf_file = self.filepath('tail.pdf')
 435         self.isbn_pdf_file = None
 436         self.pdf_file = self.filepath('final.pdf')
 437
 438         self.publish_name = bookname
 439         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 440         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 441
 442         self.book_url = config.BOOK_URL % (self.server, self.book)
 443         self.toc_url = config.TOC_URL % (self.server, self.book)
 444
 445         self.maker = PageSettings(**page_settings)
 446
 447         if engine is not None:
 448             self.engine = engine
 449         self.notify_watcher()
 450
    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            """Call self.cleanup() when the book is deleted, if its
            working directory still exists and this has not already
            been tried."""
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                # The flag is cleared before cleaning up, so the
                # cleanup is attempted at most once per book.
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()
 458
 459     def __getattr__(self, attr):
 460         """catch unloaded books and load them"""
 461         #log('looking for missing attribute "%s"' % (attr))
 462         if attr == 'tree':
 463             self.load_book()
 464             return self.tree
 465         if attr == 'toc':
 466             self.load_toc()
 467             return self.toc
 468         raise AttributeError("no such member: '%s'" % attr)
 469
 470
    def filepath(self, fn):
        """Return the path of the file named <fn> in this book's
        working directory."""
        return os.path.join(self.workdir, fn)
 473
 474     def save_data(self, fn, data):
 475         """Save without tripping up on unicode"""
 476         if isinstance(data, unicode):
 477             data = data.encode('utf8', 'ignore')
 478         f = open(fn, 'w')
 479         f.write(data)
 480         f.close()
 481
 482     def save_tempfile(self, fn, data):
 483         """Save the data in a temporary directory that will be cleaned
 484         up when all is done.  Return the absolute file path."""
 485         fn = self.filepath(fn)
 486         self.save_data(fn, data)
 487         return fn
 488
 489     def extract_pdf_outline(self):
 490         self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
 491         for x in self.outline_contents:
 492             log(x)
 493         return number_of_pages
 494
 495     def make_body_pdf(self):
 496         """Make a pdf of the HTML, using webkit"""
 497         #1. Save the html
 498         html_text = lxml.etree.tostring(self.tree, method="html")
 499         self.save_data(self.body_html_file, html_text)
 500
 501         #2. Make a pdf of it
 502         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
 503                                 engine=self.engine, outline=True)
 504         self.notify_watcher('generate_pdf')
 505
 506         n_pages = self.extract_pdf_outline()
 507
 508         log ("found %s pages in pdf" % n_pages)
 509         #4. resize pages, shift gutters, even pages
 510         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 511         self.notify_watcher('reshape_pdf')
 512
 513         #5 add page numbers
 514         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 515                               numbers=self.page_numbers)
 516         self.notify_watcher("number_pdf")
 517         self.notify_watcher()
 518
 519     def make_preamble_pdf(self):
 520         contents = self.make_contents()
 521         inside_cover_html = self.compose_inside_cover()
 522         html = ('<html dir="%s"><head>\n'
 523                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 524                 '<link rel="stylesheet" href="%s" />\n'
 525                 '</head>\n<body>\n'
 526                 '<h1 class="frontpage">%s</h1>'
 527                 '%s\n'
 528                 '<div class="contents">%s</div>\n'
 529                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 530                 '<!--%s--></div></body></html>'
 531                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 532                      contents, self.title)
 533         self.save_data(self.preamble_html_file, html)
 534
 535         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
 536                                 engine=self.engine)
 537
 538         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 539
 540         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 541                             numbers=self.preamble_page_numbers,
 542                             number_start=-2)
 543
 544         self.notify_watcher()
 545
 546     def make_end_matter_pdf(self):
 547         """Make an inside back cover and a back cover.  If there is an
 548         isbn number its barcode will be put on the back cover."""
 549         if self.isbn:
 550             self.isbn_pdf_file = self.filepath('isbn.pdf')
 551             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 552             self.notify_watcher('make_barcode_pdf')
 553
 554         self.save_data(self.tail_html_file, self.compose_end_matter())
 555         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file,
 556                                 engine=self.engine)
 557
 558         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 559                                centre_end=True, even_pages=False)
 560         self.notify_watcher()
 561
 562     def make_book_pdf(self):
 563         """A convenient wrapper of a few necessary steps"""
 564         # now the Xvfb server is needed. make sure it has had long enough to get going
 565         self.wait_for_xvfb()
 566         self.make_body_pdf()
 567         self.make_preamble_pdf()
 568         self.make_end_matter_pdf()
 569
 570         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 571                     self.body_pdf_file, self.tail_pdf_file,
 572                     self.isbn_pdf_file)
 573
 574         self.notify_watcher('concatenated_pdfs')
 575
 576
 577     def make_simple_pdf(self, mode):
 578         """Make a simple pdf document without contents or separate
 579         title page.  This is used for multicolumn newspapers and for
 580         web-destined pdfs."""
 581         self.wait_for_xvfb()
 582         #0. Add heading to begining of html
 583         body = list(self.tree.cssselect('body'))[0]
 584         e = body.makeelement('h1', {'id': 'book-title'})
 585         e.text = self.title
 586         body.insert(0, e)
 587         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 588         e.addnext(intro)
 589
 590         #0.5 adjust parameters to suit the particular kind of output
 591         if mode == 'web':
 592             self.maker.gutter = 0
 593
 594         #1. Save the html
 595         html_text = lxml.etree.tostring(self.tree, method="html")
 596         self.save_data(self.body_html_file, html_text)
 597
 598         #2. Make a pdf of it (direct to to final pdf)
 599         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file,
 600                                 engine=self.engine, outline=True)
 601         self.notify_watcher('generate_pdf')
 602         n_pages = self.extract_pdf_outline()
 603
 604         if mode != 'web':
 605             #3. resize pages and shift gutters.
 606             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 607             self.notify_watcher('reshape_pdf')
 608
 609             #4. add page numbers
 610             self.maker.number_pdf(self.pdf_file, n_pages,
 611                                   dir=self.dir, numbers=self.page_numbers)
 612             self.notify_watcher("number_pdf")
 613         self.notify_watcher()
 614
 615
 616     def rotate180(self):
 617         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 618         presses."""
 619         rotated = self.filepath('final-rotate.pdf')
 620         unrotated = self.filepath('final-pre-rotate.pdf')
 621         #leave the unrotated pdf intact at first, in case of error.
 622         rotate_pdf(self.pdf_file, rotated)
 623         os.rename(self.pdf_file, unrotated)
 624         os.rename(rotated, self.pdf_file)
 625         self.notify_watcher()
 626
 627     def publish_pdf(self):
 628         """Move the finished PDF to its final resting place"""
 629         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 630         os.rename(self.pdf_file, self.publish_file)
 631         self.notify_watcher()
 632
 633     def load_toc(self):
 634         """From the TOC.txt file create a list of TocItems with
 635         the attributes <status>, <chapter>, and <title>.
 636
 637         <status> is a number, with the following meaning:
 638
 639               0 - section heading with no chapter
 640               1 - chapter heading
 641               2 - book title
 642
 643         The TocItem object has convenience functions <is_chapter> and
 644         <is_section>.
 645
 646         <chapter> is twiki name of the chapter.
 647
 648         <title> is a human readable title for the chapter.  It is likely to
 649         differ from the title given in the chapter's <h1> heading.
 650         """
 651         f = urlopen(self.toc_url)
 652         self.toc = []
 653         while True:
 654             try:
 655                 self.toc.append(TocItem(f.next().strip(),
 656                                         f.next().strip(),
 657                                         f.next().strip()))
 658             except StopIteration:
 659                 break
 660         f.close()
 661         self.notify_watcher()
 662
 663     def load_book(self, tidy=True):
 664         """Fetch and parse the raw html of the book.  If tidy is true
 665         (default) links in the document will be made absolute."""
 666         f = urlopen(self.book_url)
 667         html = f.read()
 668         f.close()
 669         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 670                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 671                 '</head>\n<body>\n'
 672                 '%s\n'
 673                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 674                 'A FLOSSManuals book</div>\n</body></html>'
 675                 ) % (self.dir, self.book, html)
 676
 677         self.save_tempfile('raw.html', html)
 678
 679         tree = lxml.html.document_fromstring(html)
 680         if tidy:
 681             tree.make_links_absolute(self.book_url)
 682         self.tree = tree
 683         self.headings = [x for x in tree.cssselect('h1')]
 684         if self.headings:
 685             self.headings[0].set('class', "first-heading")
 686         for h1 in self.headings:
 687             h1.title = h1.text_content().strip()
 688         self.notify_watcher()
 689
    def load(self):
        """Wrapper around all necessary load methods: loads the book's
        html (load_book), then its table of contents (load_toc)."""
        self.load_book()
        self.load_toc()
 694
 695     def make_contents(self):
 696         """Generate HTML containing the table of contents.  This can
 697         only be done after the main PDF has been made."""
 698         header = '<h1>Table of Contents</h1><table class="toc">\n'
 699         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 700                     '<td class="pagenumber">%s</td></tr>\n')
 701         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 702         footer = '\n</table>'
 703
 704         contents = []
 705
 706         chapter = 1
 707         page_num = 1
 708         subsections = [] # for the subsection heading pages.
 709
 710         outline_contents = iter(self.outline_contents)
 711         headings = iter(self.headings)
 712
 713         for t in self.toc:
 714             if t.is_chapter():
 715                 try:
 716                     h1 = headings.next()
 717                 except StopIteration:
 718                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 719                     break
 720                 h1_text, level, page_num = outline_contents.next()
 721                 log("%r %r" % (h1.title, h1_text))
 722                 contents.append(row_tmpl % (chapter, h1.title, page_num))
 723                 chapter += 1
 724             elif t.is_section():
 725                 contents.append(section_tmpl % t.title)
 726             else:
 727                 log("mystery TOC item: %s" % t)
 728
 729         doc = header + '\n'.join(contents) + footer
 730         self.notify_watcher()
 731         return doc
 732
    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.

        For each section in the TOC a <div class="subsection"> is
        built, holding the section's title and then one
        <div class="chapter"> for each chapter that follows it.  The
        div is inserted just before the <h1> of the first of those
        chapters.
        """
        headings = iter(self.headings)
        chapter = 1
        section = None

        for t in self.toc:
            # NOTE(review): a chapter listed before the first section
            # matches neither branch, so it uses up no <h1> and gets
            # no number; later chapters would then be paired with the
            # wrong headings.  Confirm that TOCs always start with a
            # section.
            if t.is_chapter() and section is not None:
                try:
                    h1 = headings.next()
                except StopIteration:
                    log("heading not found for %s (previous h1 missing?)" % t)
                    break
                # list this chapter, numbered, on the section's page
                item = h1.makeelement('div', Class='chapter')
                log(h1.title, debug='HTMLGEN')
                item.text = h1.title
                _add_initial_number(item, chapter)

                section.append(item)

                # the section goes in front of its first chapter only
                if not section_placed:
                    log("placing section", debug='HTMLGEN')
                    h1.addprevious(section)
                    section_placed = True
                else:
                    log("NOT placing section", debug='HTMLGEN')

                #put a bold number at the beginning of the h1.
                _add_initial_number(h1, chapter)
                chapter += 1

            elif t.is_section():
                section = self.tree.makeelement('div', Class="subsection")
                # section Element complains when you try to ask it whether it
                # has been placed (though it does know)
                section_placed = False
                heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
                heading.set("Class", "subsection-heading")
                section.append(heading)

        self.notify_watcher()
 778
 779
 780     def add_css(self, css=None, mode='book'):
 781         """If css looks like a url, use it as a stylesheet link.
 782         Otherwise it is the CSS itself, which is saved to a temporary file
 783         and linked to."""
 784         log("css is %r" % css)
 785         htmltree = self.tree
 786         if css is None or not css.strip():
 787             defaults = SERVER_DEFAULTS[self.server]
 788             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 789         elif not re.match(r'^http://\S+$', css):
 790             fn = self.save_tempfile('objavi.css', css)
 791             url = 'file://' + fn
 792         else:
 793             url = css
 794         #XXX for debugging and perhaps sensible anyway
 795         #url = url.replace('file:///home/douglas/objavi2', '')
 796
 797
 798         #find the head -- it's probably first child but lets not assume.
 799         for child in htmltree:
 800             if child.tag == 'head':
 801                 head = child
 802                 break
 803         else:
 804             head = htmltree.makeelement('head')
 805             htmltree.insert(0, head)
 806
 807         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 808         self.css_url = url
 809         self.notify_watcher()
 810         return url
 811
 812     def set_title(self, title=None):
 813         """If a string is supplied, it becomes the book's title.
 814         Otherwise a guess is made."""
 815         if title:
 816             self.title = title
 817         else:
 818             titles = [x.text_content() for x in self.tree.cssselect('title')]
 819             if titles and titles[0]:
 820                 self.title = titles[0]
 821             else:
 822                 #oh well
 823                 self.title = 'A Manual About ' + self.book
 824         return self.title
 825
 826     def _read_localised_template(self, template, fallbacks=['en']):
 827         """Try to get the template in the approriate language, otherwise in english."""
 828         for lang in [self.lang] + fallbacks:
 829             try:
 830                 fn = template % (lang)
 831                 f = open(fn)
 832                 break
 833             except IOError, e:
 834                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 835                 log(e)
 836         template = f.read()
 837         f.close()
 838         return template
 839
 840     def compose_inside_cover(self):
 841         """create the markup for the preamble inside cover."""
 842         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 843
 844         if self.isbn:
 845             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 846         else:
 847             isbn_text = ''
 848
 849         return template % {'date': time.strftime('%Y-%m-%d'),
 850                            'isbn': isbn_text,
 851                            'license': self.license,
 852                            }
 853
 854
 855     def compose_end_matter(self):
 856         """create the markup for the end_matter inside cover.  If
 857         self.isbn is not set, the html will result in a pdf that
 858         spills onto two pages.
 859         """
 860         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 861
 862         d = {'css_url': self.css_url,
 863              'title': self.title
 864              }
 865
 866         if self.isbn:
 867             d['inside_cover_style'] = ''
 868         else:
 869             d['inside_cover_style'] = 'page-break-after: always'
 870
 871         return template % d
 872
 873
 874
 875
 876     def spawn_x(self):
 877         """Start an Xvfb instance, using a new server number.  A
 878         reference to it is stored in self.xvfb, which is used to kill
 879         it when the pdf is done.
 880
 881         Note that Xvfb doesn't interact well with dbus which is
 882         present on modern desktops.
 883         """
 884         #Find an unused server number (in case two cgis are running at once)
 885         while True:
 886             servernum = random.randrange(50, 500)
 887             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 888                 break
 889
 890         self.xserver_no = ':%s' % servernum
 891
 892         authfile = self.filepath('Xauthority')
 893         os.environ['XAUTHORITY'] = authfile
 894
 895         #mcookie(1) eats into /dev/random, so avoid that
 896         from hashlib import md5
 897         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 898         mcookie = m.hexdigest()
 899
 900         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 901
 902         self.xvfb = Popen(['Xvfb', self.xserver_no,
 903                            '-screen', '0', '1024x768x24',
 904                            '-pixdepths', '32',
 905                            #'-blackpixel', '0',
 906                            #'-whitepixel', str(2 ** 24 -1),
 907                            #'+extension', 'Composite',
 908                            '-dpi', '96',
 909                            '-kb',
 910                            '-nolisten', 'tcp',
 911                            ])
 912
 913         # We need to wait a bit before the Xvfb is ready.  but the
 914         # downloads are so slow that that probably doesn't matter
 915
 916         self.xvfb_ready_time = time.time() + 2
 917
 918         os.environ['DISPLAY'] = self.xserver_no
 919         log(self.xserver_no)
 920
 921     def wait_for_xvfb(self):
 922         """wait until a previously set time before continuing.  This
 923         is so Xvfb has time to properly start."""
 924         if hasattr(self, 'xvfb'):
 925             d = self.xvfb_ready_time - time.time()
 926             if d > 0:
 927                 time.sleep(d)
 928                 self.notify_watcher()
 929
 930     def cleanup_x(self):
 931         """Try very hard to kill off Xvfb.  In addition to killing
 932         this instance's xvfb, occasionally (randomly) search for
 933         escaped Xvfb instances and kill those too."""
 934         if not hasattr(self, 'xvfb'):
 935             return
 936         check_call(['xauth', 'remove', self.xserver_no])
 937         p = self.xvfb
 938         log("trying to kill Xvfb %s" % p.pid)
 939         os.kill(p.pid, 15)
 940         for i in range(10):
 941             if p.poll() is not None:
 942                 log("%s died with %s" % (p.pid, p.poll()))
 943                 break
 944             log("%s not dead yet" % p.pid)
 945             time.sleep(0.2)
 946         else:
 947             log("Xvfb would not die! kill -9! kill -9!")
 948             os.kill(p.pid, 9)
 949
 950         if random.random() < 0.05:
 951             #kill old xvfbs occasionally, if there are any.
 952             self.kill_old_xvfbs()
 953
 954     def kill_old_xvfbs(self):
 955         """Sometimes, despite everything, Xvfb instances hang around
 956         well after they are wanted -- for example if the cgi process
 957         dies particularly badly. So kill them if they have been
 958         running for a long time."""
 959         log("running kill_old_xvfbs")
 960         p = Popen(['ps', '-C' 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 961         data = p.communicate()[0].strip()
 962         if data:
 963             lines = data.split('\n')
 964             for line in lines:
 965                 log('dealing with ps output "%s"' % line)
 966                 try:
 967                     pid, days_, hours, minutes, seconds = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$').groups()
 968                 except AttributeError:
 969                     log("Couldn't parse that line!")
 970                 # 50 minutes should be enough xvfb time for anyone
 971                 if days or hours or int(minutes) > 50:
 972                     log("going to kill pid %s" % pid)
 973                     os.kill(int(pid), 15)
 974                     time.sleep(0.5)
 975                     os.kill(int(pid), 9)
 976         self.notify_watcher()
 977
 978     def cleanup(self):
 979         self.cleanup_x()
 980         if not config.KEEP_TEMP_FILES:
 981             for fn in os.listdir(self.workdir):
 982                 os.remove(os.path.join(self.workdir, fn))
 983             os.rmdir(self.workdir)
 984         else:
 985             log("NOT removing '%s', containing the following files:" % self.workdir)
 986             log(*os.listdir(self.workdir))
 987
 988         self.notify_watcher()
 989
 990