fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
  37 TMPDIR = os.path.abspath(config.TMPDIR)
  38 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  39 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63
  64 class TocItem(object):
  65     """This makes sense of the tuples from TOC.txt files"""
  66     def __init__(self, status, chapter, title):
  67         # status is
  68         #  0 - section heading with no chapter
  69         #  1 - chapter heading
  70         #  2 - book title
  71         #
  72         # chapter is twiki name of the chapter
  73         # title is a human readable name of the chapter.
  74         self.status = status
  75         self.chapter = chapter
  76         self.title = title
  77
  78     def is_chapter(self):
  79         return self.status == '1'
  80
  81     def is_section(self):
  82         return self.status == '0'
  83
  84     def __str__(self):
  85         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  86
  87
  88 def run(cmd):
  89     try:
  90         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
  91         out, err = p.communicate()
  92     except Exception:
  93         log("Failed on command: %r" % cmd)
  94         raise
  95     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
  96         (' '.join(cmd), cmd[0], p.poll(), out, err))
  97
  98
  99 def find_containing_paper(w, h):
 100     size = None
 101     for name, pw, ph in config.PAPER_SIZES:
 102         if pw >= w and ph >= h:
 103             mw = (pw - w) * 0.5
 104             mh = (ph - h) * 0.5
 105             return (name, mw, mh)
 106
 107     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 108                      (w * POINT_2_MM, h * POINT_2_MM))
 109
 110
 111
 112 class PageSettings(object):
 113     """Calculates and wraps commands for the generation and processing
 114     of PDFs"""
    def __init__(self, pointsize, **kwargs):
        """Work out paper, gutter and margin settings for a page.

        <pointsize> is a (width, height) pair.  Recognised keywords
        are gutter, top_margin, side_margin, bottom_margin, columns,
        column_margin, moz_printer and grey_scale; any that is missing
        gets a default calculated below.  Raises ValueError (from
        find_containing_paper) if the page fits on no known paper."""
        # the formulas for default gutters, margins and column margins
        # are quite ad-hoc and certainly improvable.

        self.width, self.height = pointsize
        # clipx, clipy: half the spare paper on each axis around the page
        self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
        # true whenever the keyword is present, whatever its value
        self.grey_scale = 'grey_scale' in kwargs

        self.gutter = kwargs.get('gutter', (config.BASE_GUTTER +
                                            config.PROPORTIONAL_GUTTER * self.width))

        default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
        self.top_margin = kwargs.get('top_margin', default_margin)
        self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
        self.side_margin = kwargs.get('side_margin', default_margin)
        self.bottom_margin = kwargs.get('bottom_margin', default_margin)
        self.columns = kwargs.get('columns', 1)

        self.column_margin = kwargs.get('column_margin', default_margin * 2 / (4.0 + self.columns))

        # where page numbers go; these are handed to pdfedit by _number_pdf
        self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
        self.number_margin = self.side_margin

        # calculate margins in mm for browsers
        # order: top, side, bottom, side (passed to wkhtmltopdf as -T -R -B -L)
        self.margins = []
        for m, clip in ((self.top_margin, clipy),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        ):
            # a margin explicitly passed as None falls back to the default here.
            # NOTE(review): number_bottom above does arithmetic on
            # bottom_margin without this fallback -- confirm None is
            # never actually passed for it.
            if m is None:
                m = default_margin
            self.margins.append((m + clip) * POINT_2_MM)

        # debug dump of every local variable and every attribute; the
        # local names appear in the output.
        for x in locals().iteritems():
            log("%s: %s" % x, debug='PDFGEN')
        for x in dir(self):
            log("%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 153
 154
 155
 156     def _webkit_command(self, html, pdf, outline=False):
 157         m = [str(x) for x in self.margins]
 158         outline_args = ['--outline'] * outline
 159         greyscale_args = ['-g'] * self.grey_scale
 160         cmd = ([config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 161                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 162                '-d', '100'] + outline_args + greyscale_args +
 163                config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf])
 164         log(' '.join(cmd))
 165         return cmd
 166
 167     def _gecko_command(self, html, pdf, outline=False):
 168         m = [str(x) for x in self.margins]
 169         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 170         cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
 171                html, '-printprinter', self.moz_printer]
 172         log(' '.join(cmd))
 173         return cmd
 174
 175     def make_raw_pdf(self, html, pdf, engine='webkit', outline=False):
 176         func = getattr(self, '_%s_command' % engine)
 177         if self.columns == 1:
 178             cmd = func(html, pdf, outline=outline)
 179             run(cmd)
 180         else:
 181             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 182             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 183             page_width = column_width + self.column_margin
 184
 185             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 186                                        gutter=0, top_margin=self.top_margin,
 187                                        side_margin=self.column_margin * 0.5,
 188                                        bottom_margin=self.bottom_margin)
 189
 190             column_pdf = pdf[:-4] + '-single-column.pdf'
 191             columnmaker.make_raw_pdf(html, column_pdf, engine=engine, outline=outline)
 192             columnmaker.reshape_pdf(column_pdf)
 193
 194             cmd = ['pdfnup',
 195                    '--nup', '%sx1' % int(self.columns),
 196                    '--paper', self.papersize.lower() + 'paper',
 197                    '--outfile', pdf,
 198                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 199                    '--noautoscale', 'true',
 200                    '--orient', 'portrait',
 201                    #'--tidy', 'false',
 202                    column_pdf
 203                    ]
 204
 205             run(cmd)
 206
 207
 208
 209     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 210                     even_pages=True):
 211         """Spin the pdf for RTL text, resize it to the right size, and
 212         shift the gutter left and right"""
 213         ops = 'resize'
 214         if self.gutter:
 215             ops += ',shift'
 216         if even_pages:
 217             ops += ',even_pages'
 218         gutter = self.gutter
 219         if dir == 'RTL':
 220             gutter = -gutter
 221         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 222                'dir=%s' % dir,
 223                'filename=%s' % pdf,
 224                'output_filename=%s' % pdf,
 225                'operation=%s' % ops,
 226                'width=%s' % self.width,
 227                'height=%s' % self.height,
 228                'offset=%s' % gutter,
 229                'centre_start=%s' % centre_start,
 230                'centre_end=%s' % centre_end,
 231                ]
 232         run(cmd)
 233
 234     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 235                     number_start=1):
 236         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 237                'operation=page_numbers',
 238                'dir=%s' % dir,
 239                'filename=%s' % pdf,
 240                'output_filename=%s' % pdf,
 241                'number_start=%s' % number_start,
 242                'number_style=%s' % numbers,
 243                'number_bottom=%s' % self.number_bottom,
 244                'number_margin=%s' % self.number_margin,
 245                ]
 246         run(cmd)
 247
 248     def number_pdf(self, pdf, pages, **kwargs):
 249         # if there are too many pages for pdfedit to handle in one go,
 250         # split the job into bits.  <pages> may not be exact
 251         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 252             self._number_pdf(pdf, **kwargs)
 253         else:
 254             # section_size must be even
 255             sections = pages // PDFEDIT_MAX_PAGES + 1
 256             section_size = (pages // sections + 2) & ~1
 257
 258             pdf_sections = []
 259             s = kwargs.pop('number_start', 1)
 260             while s < pages:
 261                 e = s + section_size - 1
 262                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 263                 if e < pages - 1:
 264                     page_range = '%s-%s' % (s, e)
 265                 else:
 266                     page_range = '%s-end' % s
 267                 run(['pdftk',
 268                      pdf,
 269                      'cat',
 270                      page_range,
 271                      'output',
 272                      pdf_section,
 273                      ])
 274                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 275                 pdf_sections.append(pdf_section)
 276                 s = e + 1
 277
 278             concat_pdfs(pdf, *pdf_sections)
 279
 280     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 281         """Put an ISBN barcode in a corner of a single blank page."""
 282
 283         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 284         cmd1 = [config.BOOKLAND,
 285                 '--position', position,
 286                 str(isbn)]
 287         cmd2 = ['ps2pdf',
 288                 '-dFIXEDMEDIA',
 289                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 290                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 291                 '-', pdf]
 292
 293         p1 = Popen(cmd1, stdout=PIPE)
 294         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 295         out, err = p2.communicate()
 296
 297         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 298         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 299
 300
 301 def count_pdf_pages(pdf):
 302     """How many pages in the PDF?"""
 303     #XXX could also use python-pypdf or python-poppler
 304     cmd = ('pdfinfo', pdf)
 305     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 306     out, err = p.communicate()
 307     m = re.search(r'^\s*Pages:\s*(\d+)\s*$', re.MULTILINE)
 308     return int(m.group(1))
 309
 310
 311 def concat_pdfs(destination, *pdfs):
 312     """Join all the named pdfs together into one and save it as <name>"""
 313     cmd = ['pdftk']
 314     cmd.extend(x for x in pdfs if x is not None)
 315     cmd += ['cat', 'output', destination]
 316     run(cmd)
 317
 318 def index_pdf(pdf, text=None):
 319     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 320     separate pages."""
 321     if text is None:
 322         text = pdf + '.index.txt'
 323     cmd = ['pdftotext',
 324            #'-layout', #keeps more original formatting
 325            pdf,
 326            text]
 327     run(cmd)
 328     return text
 329
 330 def rotate_pdf(pdfin, pdfout):
 331     """Turn the PDF on its head"""
 332     cmd = ['pdftk', pdfin,
 333            'cat',
 334            '1-endD',
 335            'output',
 336            pdfout
 337            ]
 338     run(cmd)
 339
 340 def parse_outline(pdf, level_threshold):
 341     """Create a structure reflecting the outline of a PDF.
 342     A chapter heading looks like this:
 343
 344     BookmarkTitle: 2. What is sound?
 345     BookmarkLevel: 1
 346     BookmarkPageNumber: 3
 347     """
 348     cmd = ('pdftk', pdf, 'dump_data')
 349     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 350     outline, err = p.communicate()
 351     lines = (x.strip() for x in outline.split('\n') if x.strip())
 352     contents = []
 353
 354     def extract(expected, conv=str.strip):
 355         line = lines.next()
 356         try:
 357             k, v = line.split(':', 1)
 358             if k == expected:
 359                 return conv(v)
 360         except ValueError:
 361             log("trouble with line %r" %line)
 362
 363     #There are a few useless variables, then the pagecount, then the contents.
 364     #The pagecount is useful, so pick it up first.
 365     page_count = None
 366     while page_count == None:
 367         page_count = extract('NumberOfPages', int)
 368
 369     try:
 370         while True:
 371             title = extract('BookmarkTitle')
 372             if title is not None:
 373                 level = extract('BookmarkLevel', int)
 374                 pagenum = extract('BookmarkPageNumber', int)
 375                 if level <= level_threshold and None not in (level, pagenum):
 376                     contents.append((title, level, pagenum))
 377     except StopIteration:
 378         pass
 379
 380     return contents, outline, page_count
 381
 382
 383 class Book(object):
 384     page_numbers = 'latin'
 385     preamble_page_numbers = 'roman'
 386     engine= 'webkit'
 387     _try_cleanup_on_del = config.TRY_BOOK_CLEANUP_ON_DEL
 388
 389     def notify_watcher(self, message=None):
 390         if self.watcher:
 391             if  message is None:
 392                 #message is the name of the caller
 393                 #XXX look at using inspect module
 394                 import traceback
 395                 message = traceback.extract_stack(None, 2)[0][2]
 396             log("notify_watcher called with '%s'" % message)
 397             self.watcher(message)
 398
    def __enter__(self):
        """Context manager entry: the book itself is what the 'with'
        statement binds."""
        return self
 401
    def __exit__(self, exc_type, exc_value, traceback):
        """Context manager exit: call cleanup(), whether or not the
        block raised.  Nothing is returned, so an exception from the
        block carries on propagating."""
        self.cleanup()
        #could deal with exceptions here and return true
 405
 406     def __init__(self, book, server, bookname,
 407                  page_settings=None, engine=None, watcher=None, isbn=None,
 408                  license=config.DEFAULT_LICENSE):
 409         log("*** Starting new book %s ***" % bookname)
 410         self.book = book
 411         self.server = server
 412         self.watcher = watcher
 413         self.isbn = isbn
 414         self.license = license
 415         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 416         os.chmod(self.workdir, 0755)
 417         defaults = SERVER_DEFAULTS[server]
 418         self.lang = defaults['lang']
 419         self.dir  = defaults['dir']
 420
 421         self.body_html_file = self.filepath('body.html')
 422         self.body_pdf_file = self.filepath('body.pdf')
 423         self.body_index_file = self.filepath('body.txt')
 424         self.preamble_html_file = self.filepath('preamble.html')
 425         self.preamble_pdf_file = self.filepath('preamble.pdf')
 426         self.tail_html_file = self.filepath('tail.html')
 427         self.tail_pdf_file = self.filepath('tail.pdf')
 428         self.isbn_pdf_file = None
 429         self.pdf_file = self.filepath('final.pdf')
 430
 431         self.publish_name = bookname
 432         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 433         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 434
 435         self.book_url = config.BOOK_URL % (self.server, self.book)
 436         self.toc_url = config.TOC_URL % (self.server, self.book)
 437
 438         self.maker = PageSettings(**page_settings)
 439
 440         if engine is not None:
 441             self.engine = engine
 442         self.notify_watcher()
 443
 444     def __del__(self):
 445         if self._try_cleanup_on_del and os.path.exists(self.workdir):
 446             self._try_cleanup_on_del = False #or else you can get in bad cycles
 447             self.cleanup()
 448
 449     def __getattr__(self, attr):
 450         """catch unloaded books and load them"""
 451         #log('looking for missing attribute "%s"' % (attr))
 452         if attr == 'tree':
 453             self.load_book()
 454             return self.tree
 455         if attr == 'toc':
 456             self.load_toc()
 457             return self.toc
 458         raise AttributeError("no such member: '%s'" % attr)
 459
 460
    def filepath(self, fn):
        """Return the path of the file named <fn> inside this book's
        working directory."""
        return os.path.join(self.workdir, fn)
 463
 464     def save_data(self, fn, data):
 465         """Save without tripping up on unicode"""
 466         if isinstance(data, unicode):
 467             data = data.encode('utf8', 'ignore')
 468         f = open(fn, 'w')
 469         f.write(data)
 470         f.close()
 471
 472     def save_tempfile(self, fn, data):
 473         """Save the data in a temporary directory that will be cleaned
 474         up when all is done.  Return the absolute file path."""
 475         fn = self.filepath(fn)
 476         self.save_data(fn, data)
 477         return fn
 478
 479     def extract_pdf_outline(self):
 480         self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
 481         for x in self.outline_contents:
 482             log(x)
 483         return number_of_pages
 484
 485     def make_body_pdf(self):
 486         """Make a pdf of the HTML, using webkit"""
 487         #1. Save the html
 488         html_text = lxml.etree.tostring(self.tree, method="html")
 489         self.save_data(self.body_html_file, html_text)
 490
 491         #2. Make a pdf of it
 492         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
 493                                 engine=self.engine, outline=True)
 494         self.notify_watcher('generate_pdf')
 495
 496         n_pages = self.extract_pdf_outline()
 497
 498         log ("found %s pages in pdf" % n_pages)
 499         #4. resize pages, shift gutters, even pages
 500         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 501         self.notify_watcher('reshape_pdf')
 502
 503         #5 add page numbers
 504         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 505                               numbers=self.page_numbers)
 506         self.notify_watcher("number_pdf")
 507         self.notify_watcher()
 508
 509     def make_preamble_pdf(self):
 510         contents = self.make_contents()
 511         inside_cover_html = self.compose_inside_cover()
 512         html = ('<html dir="%s"><head>\n'
 513                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 514                 '<link rel="stylesheet" href="%s" />\n'
 515                 '</head>\n<body>\n'
 516                 '<h1 class="frontpage">%s</h1>'
 517                 '%s\n'
 518                 '<div class="contents">%s</div>\n'
 519                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 520                 '<!--%s--></div></body></html>'
 521                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 522                      contents, self.title)
 523         self.save_data(self.preamble_html_file, html)
 524
 525         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
 526                                 engine=self.engine)
 527
 528         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 529
 530         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 531                             numbers=self.preamble_page_numbers,
 532                             number_start=-2)
 533
 534         self.notify_watcher()
 535
 536     def make_end_matter_pdf(self):
 537         """Make an inside back cover and a back cover.  If there is an
 538         isbn number its barcode will be put on the back cover."""
 539         if self.isbn:
 540             self.isbn_pdf_file = self.filepath('isbn.pdf')
 541             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 542             self.notify_watcher('make_barcode_pdf')
 543
 544         self.save_data(self.tail_html_file, self.compose_end_matter())
 545         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file,
 546                                 engine=self.engine)
 547
 548         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 549                                centre_end=True, even_pages=False)
 550         self.notify_watcher()
 551
 552     def make_book_pdf(self):
 553         """A convenient wrapper of a few necessary steps"""
 554         # now the Xvfb server is needed. make sure it has had long enough to get going
 555         self.wait_for_xvfb()
 556         self.make_body_pdf()
 557         self.make_preamble_pdf()
 558         self.make_end_matter_pdf()
 559
 560         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 561                     self.body_pdf_file, self.tail_pdf_file,
 562                     self.isbn_pdf_file)
 563
 564         self.notify_watcher('concatenated_pdfs')
 565
 566
 567     def make_simple_pdf(self, mode):
 568         """Make a simple pdf document without contents or separate
 569         title page.  This is used for multicolumn newspapers and for
 570         web-destined pdfs."""
 571         self.wait_for_xvfb()
 572         #0. Add heading to begining of html
 573         body = list(self.tree.cssselect('body'))[0]
 574         e = body.makeelement('h1', {'id': 'book-title'})
 575         e.text = self.title
 576         body.insert(0, e)
 577         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 578         e.addnext(intro)
 579
 580         #0.5 adjust parameters to suit the particular kind of output
 581         if mode == 'web':
 582             self.maker.gutter = 0
 583
 584         #1. Save the html
 585         html_text = lxml.etree.tostring(self.tree, method="html")
 586         self.save_data(self.body_html_file, html_text)
 587
 588         #2. Make a pdf of it (direct to to final pdf)
 589         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file,
 590                                 engine=self.engine, outline=True)
 591         self.notify_watcher('generate_pdf')
 592         n_pages = self.extract_pdf_outline()
 593
 594         if mode != 'web':
 595             #3. resize pages and shift gutters.
 596             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 597             self.notify_watcher('reshape_pdf')
 598
 599             #4. add page numbers
 600             self.maker.number_pdf(self.pdf_file, n_pages,
 601                                   dir=self.dir, numbers=self.page_numbers)
 602             self.notify_watcher("number_pdf")
 603         self.notify_watcher()
 604
 605
 606     def rotate180(self):
 607         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 608         presses."""
 609         rotated = self.filepath('final-rotate.pdf')
 610         unrotated = self.filepath('final-pre-rotate.pdf')
 611         #leave the unrotated pdf intact at first, in case of error.
 612         rotate_pdf(self.pdf_file, rotated)
 613         os.rename(self.pdf_file, unrotated)
 614         os.rename(rotated, self.pdf_file)
 615         self.notify_watcher()
 616
 617     def publish_pdf(self):
 618         """Move the finished PDF to its final resting place"""
 619         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 620         os.rename(self.pdf_file, self.publish_file)
 621         self.notify_watcher()
 622
 623     def load_toc(self):
 624         """From the TOC.txt file create a list of TocItems with
 625         the attributes <status>, <chapter>, and <title>.
 626
 627         <status> is a number, with the following meaning:
 628
 629               0 - section heading with no chapter
 630               1 - chapter heading
 631               2 - book title
 632
 633         The TocItem object has convenience functions <is_chapter> and
 634         <is_section>.
 635
 636         <chapter> is twiki name of the chapter.
 637
 638         <title> is a human readable title for the chapter.  It is likely to
 639         differ from the title given in the chapter's <h1> heading.
 640         """
 641         f = urlopen(self.toc_url)
 642         self.toc = []
 643         while True:
 644             try:
 645                 self.toc.append(TocItem(f.next().strip(),
 646                                         f.next().strip(),
 647                                         f.next().strip()))
 648             except StopIteration:
 649                 break
 650         f.close()
 651         self.notify_watcher()
 652
 653     def load_book(self, tidy=True):
 654         """Fetch and parse the raw html of the book.  If tidy is true
 655         (default) links in the document will be made absolute."""
 656         f = urlopen(self.book_url)
 657         html = f.read()
 658         f.close()
 659         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 660                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 661                 '</head>\n<body>\n'
 662                 '%s\n'
 663                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 664                 'A FLOSSManuals book</div>\n</body></html>'
 665                 ) % (self.dir, self.book, html)
 666
 667         self.save_tempfile('raw.html', html)
 668
 669         tree = lxml.html.document_fromstring(html)
 670         if tidy:
 671             tree.make_links_absolute(self.book_url)
 672         self.tree = tree
 673         self.headings = [x for x in tree.cssselect('h1')]
 674         if self.headings:
 675             self.headings[0].set('class', "first-heading")
 676         for h1 in self.headings:
 677             h1.title = h1.text_content().strip()
 678         self.notify_watcher()
 679
    def load(self):
        """Wrapper around all necessary load methods: fetch the book's
        html (load_book), then its table of contents (load_toc)."""
        self.load_book()
        self.load_toc()
 684
 685     def make_contents(self):
 686         """Generate HTML containing the table of contents.  This can
 687         only be done after the main PDF has been made."""
 688         header = '<h1>Table of Contents</h1><table class="toc">\n'
 689         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 690                     '<td class="pagenumber">%s</td></tr>\n')
 691         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 692         footer = '\n</table>'
 693
 694         contents = []
 695
 696         chapter = 1
 697         page_num = 1
 698         subsections = [] # for the subsection heading pages.
 699
 700         outline_contents = iter(self.outline_contents)
 701         headings = iter(self.headings)
 702
 703         for t in self.toc:
 704             if t.is_chapter():
 705                 try:
 706                     h1 = headings.next()
 707                 except StopIteration:
 708                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 709                     break
 710                 h1_text, level, page_num = outline_contents.next()
 711                 log("%r %r" % (h1.title, h1_text))
 712                 contents.append(row_tmpl % (chapter, h1.title, page_num))
 713                 chapter += 1
 714             elif t.is_section():
 715                 contents.append(section_tmpl % t.title)
 716             else:
 717                 log("mystery TOC item: %s" % t)
 718
 719         doc = header + '\n'.join(contents) + footer
 720         self.notify_watcher()
 721         return doc
 722
 723     def add_section_titles(self):
 724         """Add any section heading pages that the TOC.txt file
 725         specifies.  These are sub-book, super-chapter groupings.
 726
 727         Also add initial numbers to chapters.
 728         """
 729         headings = iter(self.headings)
 730         chapter = 1
 731         section = None
 732
 733         for t in self.toc:
 734             if t.is_chapter() and section is not None:
 735                 try:
 736                     h1 = headings.next()
 737                 except StopIteration:
 738                     log("heading not found for %s (previous h1 missing?)" % t)
 739                     break
 740                 item = h1.makeelement('div', Class='chapter')
 741                 log(h1.title, debug='HTMLGEN')
 742                 item.text = h1.title
 743                 _add_initial_number(item, chapter)
 744
 745                 section.append(item)
 746
 747                 if not section_placed:
 748                     log("placing section", debug='HTMLGEN')
 749                     h1.addprevious(section)
 750                     section_placed = True
 751                 else:
 752                     log("NOT placing section", debug='HTMLGEN')
 753
 754                 #put a bold number at the beginning of the h1.
 755                 _add_initial_number(h1, chapter)
 756                 chapter += 1
 757
 758             elif t.is_section():
 759                 section = self.tree.makeelement('div', Class="subsection")
 760                 # section Element complains when you try to ask it whether it
 761                 # has been placed (though it does know)
 762                 section_placed = False
 763                 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
 764                 heading.set("Class", "subsection-heading")
 765                 section.append(heading)
 766
 767         self.notify_watcher()
 768
 769
 770     def add_css(self, css=None, mode='book'):
 771         """If css looks like a url, use it as a stylesheet link.
 772         Otherwise it is the CSS itself, which is saved to a temporary file
 773         and linked to."""
 774         log("css is %r" % css)
 775         htmltree = self.tree
 776         if css is None or not css.strip():
 777             defaults = SERVER_DEFAULTS[self.server]
 778             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 779         elif not re.match(r'^http://\S+$', css):
 780             fn = self.save_tempfile('objavi.css', css)
 781             url = 'file://' + fn
 782         else:
 783             url = css
 784         #XXX for debugging and perhaps sensible anyway
 785         #url = url.replace('file:///home/douglas/objavi2', '')
 786
 787
 788         #find the head -- it's probably first child but lets not assume.
 789         for child in htmltree:
 790             if child.tag == 'head':
 791                 head = child
 792                 break
 793         else:
 794             head = htmltree.makeelement('head')
 795             htmltree.insert(0, head)
 796
 797         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 798         self.css_url = url
 799         self.notify_watcher()
 800         return url
 801
 802     def set_title(self, title=None):
 803         """If a string is supplied, it becomes the book's title.
 804         Otherwise a guess is made."""
 805         if title:
 806             self.title = title
 807         else:
 808             titles = [x.text_content() for x in self.tree.cssselect('title')]
 809             if titles and titles[0]:
 810                 self.title = titles[0]
 811             else:
 812                 #oh well
 813                 self.title = 'A Manual About ' + self.book
 814         return self.title
 815
 816     def _read_localised_template(self, template, fallbacks=['en']):
 817         """Try to get the template in the approriate language, otherwise in english."""
 818         for lang in [self.lang] + fallbacks:
 819             try:
 820                 fn = template % (lang)
 821                 f = open(fn)
 822                 break
 823             except IOError, e:
 824                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 825                 log(e)
 826         template = f.read()
 827         f.close()
 828         return template
 829
 830     def compose_inside_cover(self):
 831         """create the markup for the preamble inside cover."""
 832         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 833
 834         if self.isbn:
 835             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 836         else:
 837             isbn_text = ''
 838
 839         return template % {'date': time.strftime('%Y-%m-%d'),
 840                            'isbn': isbn_text,
 841                            'license': self.license,
 842                            }
 843
 844
 845     def compose_end_matter(self):
 846         """create the markup for the end_matter inside cover.  If
 847         self.isbn is not set, the html will result in a pdf that
 848         spills onto two pages.
 849         """
 850         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 851
 852         d = {'css_url': self.css_url,
 853              'title': self.title
 854              }
 855
 856         if self.isbn:
 857             d['inside_cover_style'] = ''
 858         else:
 859             d['inside_cover_style'] = 'page-break-after: always'
 860
 861         return template % d
 862
 863
 864
 865
 866     def spawn_x(self):
 867         """Start an Xvfb instance, using a new server number.  A
 868         reference to it is stored in self.xvfb, which is used to kill
 869         it when the pdf is done.
 870
 871         Note that Xvfb doesn't interact well with dbus which is
 872         present on modern desktops.
 873         """
 874         #Find an unused server number (in case two cgis are running at once)
 875         while True:
 876             servernum = random.randrange(50, 500)
 877             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 878                 break
 879
 880         self.xserver_no = ':%s' % servernum
 881
 882         authfile = self.filepath('Xauthority')
 883         os.environ['XAUTHORITY'] = authfile
 884
 885         #mcookie(1) eats into /dev/random, so avoid that
 886         from hashlib import md5
 887         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 888         mcookie = m.hexdigest()
 889
 890         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 891
 892         self.xvfb = Popen(['Xvfb', self.xserver_no,
 893                            '-screen', '0', '1024x768x24',
 894                            '-pixdepths', '32',
 895                            #'-blackpixel', '0',
 896                            #'-whitepixel', str(2 ** 24 -1),
 897                            #'+extension', 'Composite',
 898                            '-dpi', '96',
 899                            '-kb',
 900                            '-nolisten', 'tcp',
 901                            ])
 902
 903         # We need to wait a bit before the Xvfb is ready.  but the
 904         # downloads are so slow that that probably doesn't matter
 905
 906         self.xvfb_ready_time = time.time() + 2
 907
 908         os.environ['DISPLAY'] = self.xserver_no
 909         log(self.xserver_no)
 910
 911     def wait_for_xvfb(self):
 912         """wait until a previously set time before continuing.  This
 913         is so Xvfb has time to properly start."""
 914         if hasattr(self, 'xvfb'):
 915             d = self.xvfb_ready_time - time.time()
 916             if d > 0:
 917                 time.sleep(d)
 918                 self.notify_watcher()
 919
 920     def cleanup_x(self):
 921         """Try very hard to kill off Xvfb.  In addition to killing
 922         this instance's xvfb, occasionally (randomly) search for
 923         escaped Xvfb instances and kill those too."""
 924         if not hasattr(self, 'xvfb'):
 925             return
 926         check_call(['xauth', 'remove', self.xserver_no])
 927         p = self.xvfb
 928         log("trying to kill Xvfb %s" % p.pid)
 929         os.kill(p.pid, 15)
 930         for i in range(10):
 931             if p.poll() is not None:
 932                 log("%s died with %s" % (p.pid, p.poll()))
 933                 break
 934             log("%s not dead yet" % p.pid)
 935             time.sleep(0.2)
 936         else:
 937             log("Xvfb would not die! kill -9! kill -9!")
 938             os.kill(p.pid, 9)
 939
 940         if random.random() < 0.05:
 941             #kill old xvfbs occasionally, if there are any.
 942             self.kill_old_xvfbs()
 943
 944     def kill_old_xvfbs(self):
 945         """Sometimes, despite everything, Xvfb instances hang around
 946         well after they are wanted -- for example if the cgi process
 947         dies particularly badly. So kill them if they have been
 948         running for a long time."""
 949         log("running kill_old_xvfbs")
 950         p = Popen(['ps', '-C' 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 951         data = p.communicate()[0].strip()
 952         if data:
 953             lines = data.split('\n')
 954             for line in lines:
 955                 log('dealing with ps output "%s"' % line)
 956                 try:
 957                     pid, days_, hours, minutes, seconds = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$').groups()
 958                 except AttributeError:
 959                     log("Couldn't parse that line!")
 960                 # 50 minutes should be enough xvfb time for anyone
 961                 if days or hours or int(minutes) > 50:
 962                     log("going to kill pid %s" % pid)
 963                     os.kill(int(pid), 15)
 964                     time.sleep(0.5)
 965                     os.kill(int(pid), 9)
 966         self.notify_watcher()
 967
 968     def cleanup(self):
 969         self.cleanup_x()
 970         if not config.KEEP_TEMP_FILES:
 971             for fn in os.listdir(self.workdir):
 972                 os.remove(os.path.join(self.workdir, fn))
 973             os.rmdir(self.workdir)
 974         else:
 975             log("NOT removing '%s', containing the following files:" % self.workdir)
 976             log(*os.listdir(self.workdir))
 977
 978         self.notify_watcher()
 979
 980