fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
  37 TMPDIR = os.path.abspath(config.TMPDIR)
  38 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  39 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63
  64 class TocItem(object):
  65     """This makes sense of the tuples from TOC.txt files"""
  66     def __init__(self, status, chapter, title):
  67         # status is
  68         #  0 - section heading with no chapter
  69         #  1 - chapter heading
  70         #  2 - book title
  71         #
  72         # chapter is twiki name of the chapter
  73         # title is a human readable name of the chapter.
  74         self.status = status
  75         self.chapter = chapter
  76         self.title = title
  77
  78     def is_chapter(self):
  79         return self.status == '1'
  80
  81     def is_section(self):
  82         return self.status == '0'
  83
  84     def __str__(self):
  85         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  86
  87
  88 def run(cmd):
  89     try:
  90         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
  91         out, err = p.communicate()
  92     except Exception:
  93         log("Failed on command: %r" % cmd)
  94         raise
  95     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
  96         (' '.join(cmd), cmd[0], p.poll(), out, err))
  97
  98
  99 def find_containing_paper(w, h):
 100     size = None
 101     for name, pw, ph in config.PAPER_SIZES:
 102         if pw >= w and ph >= h:
 103             mw = (pw - w) * 0.5
 104             mh = (ph - h) * 0.5
 105             return (name, mw, mh)
 106
 107     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 108                      (w * POINT_2_MM, h * POINT_2_MM))
 109
 110
 111
 112 class PageSettings(object):
 113     """Calculates and wraps commands for the generation and processing
 114     of PDFs"""
 115     def __init__(self, pointsize, **kwargs):
 116         # the formulas for default gutters, margins and column margins
 117         # are quite ad-hoc and certainly improvable.
 118
 119         self.width, self.height = pointsize
 120         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
 121         self.grey_scale = 'grey_scale' in kwargs
 122
 123         # All measurements in points unless otherwise stated
 124         # user interaction is in *mm*, but is converted in objavi2.py
 125         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
 126         default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)
 127
 128         self.top_margin = kwargs.get('top_margin', default_margin)
 129         self.side_margin = kwargs.get('side_margin', default_margin)
 130         self.bottom_margin = kwargs.get('bottom_margin', default_margin)
 131         self.gutter = kwargs.get('gutter', default_gutter)
 132
 133         self.columns = kwargs.get('columns', 1)
 134         if self.columns == 'auto': #default for newspapers is to work out columns
 135             self.columns = int(self.width // config.MIN_COLUMN_WIDTH)
 136
 137         self.column_margin = kwargs.get('column_margin',
 138                                         default_margin * 2 / (5.0 + self.columns))
 139
 140         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
 141         self.number_margin = self.side_margin
 142
 143         # calculate margins in mm for browsers
 144         self.margins = []
 145         for m, clip in ((self.top_margin, clipy),
 146                         (self.side_margin, clipx + 0.5 * self.gutter),
 147                         (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
 148                         (self.side_margin, clipx + 0.5 * self.gutter),
 149                         ):
 150             self.margins.append((m + clip) * POINT_2_MM)
 151
 152         self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
 153
 154         if 'PDFGEN' in config.DEBUG_MODES:
 155             log("making PageSettings with:")
 156             for x in locals().iteritems():
 157                 log("%s: %s" % x, debug='PDFGEN')
 158             for x in dir(self):
 159                 if not x.startswith('__'):
 160                     log("self.%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 161
 162
 163
 164     def _webkit_command(self, html, pdf, outline=False):
 165         m = [str(x) for x in self.margins]
 166         outline_args = ['--outline'] * outline
 167         greyscale_args = ['-g'] * self.grey_scale
 168         cmd = ([config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 169                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 170                '-d', '100'] + outline_args + greyscale_args +
 171                config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf])
 172         log(' '.join(cmd))
 173         return cmd
 174
 175     def _gecko_command(self, html, pdf, outline=False):
 176         m = [str(x) for x in self.margins]
 177         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 178         cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
 179                html, '-printprinter', self.moz_printer]
 180         log(' '.join(cmd))
 181         return cmd
 182
 183     def make_raw_pdf(self, html, pdf, engine='webkit', outline=False):
 184         func = getattr(self, '_%s_command' % engine)
 185         if self.columns == 1:
 186             cmd = func(html, pdf, outline=outline)
 187             run(cmd)
 188         else:
 189             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 190             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 191             page_width = column_width + self.column_margin
 192             side_margin = self.column_margin * 0.5
 193             if 'PDFGEN' in config.DEBUG_MODES:
 194                 log("making columns with:")
 195                 for k, v in locals().iteritems():
 196                     log("%s: %r" % (k, v))
 197                 for k in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
 198                     log("self.%s: %r" % (k, getattr(self, k)))
 199
 200             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 201                                        gutter=0, top_margin=self.top_margin,
 202                                        side_margin=side_margin,
 203                                        bottom_margin=self.bottom_margin,
 204                                        grey_scale=self.grey_scale,
 205                                        )
 206
 207             column_pdf = pdf[:-4] + '-single-column.pdf'
 208             columnmaker.make_raw_pdf(html, column_pdf, engine=engine, outline=outline)
 209             columnmaker.reshape_pdf(column_pdf)
 210
 211             cmd = ['pdfnup',
 212                    '--nup', '%sx1' % int(self.columns),
 213                    '--paper', self.papersize.lower() + 'paper',
 214                    '--outfile', pdf,
 215                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 216                    '--noautoscale', 'true',
 217                    '--orient', 'portrait',
 218                    #'--tidy', 'false',
 219                    column_pdf
 220                    ]
 221
 222             run(cmd)
 223
 224
 225
 226     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 227                     even_pages=True):
 228         """Spin the pdf for RTL text, resize it to the right size, and
 229         shift the gutter left and right"""
 230         ops = 'resize'
 231         if self.gutter:
 232             ops += ',shift'
 233         if even_pages:
 234             ops += ',even_pages'
 235         gutter = self.gutter
 236         if dir == 'RTL':
 237             gutter = -gutter
 238         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 239                'dir=%s' % dir,
 240                'filename=%s' % pdf,
 241                'output_filename=%s' % pdf,
 242                'operation=%s' % ops,
 243                'width=%s' % self.width,
 244                'height=%s' % self.height,
 245                'offset=%s' % gutter,
 246                'centre_start=%s' % centre_start,
 247                'centre_end=%s' % centre_end,
 248                ]
 249         run(cmd)
 250
 251     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 252                     number_start=1):
 253         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 254                'operation=page_numbers',
 255                'dir=%s' % dir,
 256                'filename=%s' % pdf,
 257                'output_filename=%s' % pdf,
 258                'number_start=%s' % number_start,
 259                'number_style=%s' % numbers,
 260                'number_bottom=%s' % self.number_bottom,
 261                'number_margin=%s' % self.number_margin,
 262                ]
 263         run(cmd)
 264
 265     def number_pdf(self, pdf, pages, **kwargs):
 266         # if there are too many pages for pdfedit to handle in one go,
 267         # split the job into bits.  <pages> may not be exact
 268         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 269             self._number_pdf(pdf, **kwargs)
 270         else:
 271             # section_size must be even
 272             sections = pages // PDFEDIT_MAX_PAGES + 1
 273             section_size = (pages // sections + 2) & ~1
 274
 275             pdf_sections = []
 276             s = kwargs.pop('number_start', 1)
 277             while s < pages:
 278                 e = s + section_size - 1
 279                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 280                 if e < pages - 1:
 281                     page_range = '%s-%s' % (s, e)
 282                 else:
 283                     page_range = '%s-end' % s
 284                 run(['pdftk',
 285                      pdf,
 286                      'cat',
 287                      page_range,
 288                      'output',
 289                      pdf_section,
 290                      ])
 291                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 292                 pdf_sections.append(pdf_section)
 293                 s = e + 1
 294
 295             concat_pdfs(pdf, *pdf_sections)
 296
 297     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 298         """Put an ISBN barcode in a corner of a single blank page."""
 299
 300         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 301         cmd1 = [config.BOOKLAND,
 302                 '--position', position,
 303                 str(isbn)]
 304         cmd2 = ['ps2pdf',
 305                 '-dFIXEDMEDIA',
 306                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 307                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 308                 '-', pdf]
 309
 310         p1 = Popen(cmd1, stdout=PIPE)
 311         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 312         out, err = p2.communicate()
 313
 314         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 315         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 316
 317
 318 def count_pdf_pages(pdf):
 319     """How many pages in the PDF?"""
 320     #XXX could also use python-pypdf or python-poppler
 321     cmd = ('pdfinfo', pdf)
 322     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 323     out, err = p.communicate()
 324     m = re.search(r'^\s*Pages:\s*(\d+)\s*$', re.MULTILINE)
 325     return int(m.group(1))
 326
 327
 328 def concat_pdfs(destination, *pdfs):
 329     """Join all the named pdfs together into one and save it as <name>"""
 330     cmd = ['pdftk']
 331     cmd.extend(x for x in pdfs if x is not None)
 332     cmd += ['cat', 'output', destination]
 333     run(cmd)
 334
 335 def index_pdf(pdf, text=None):
 336     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 337     separate pages."""
 338     if text is None:
 339         text = pdf + '.index.txt'
 340     cmd = ['pdftotext',
 341            #'-layout', #keeps more original formatting
 342            pdf,
 343            text]
 344     run(cmd)
 345     return text
 346
 347 def rotate_pdf(pdfin, pdfout):
 348     """Turn the PDF on its head"""
 349     cmd = ['pdftk', pdfin,
 350            'cat',
 351            '1-endD',
 352            'output',
 353            pdfout
 354            ]
 355     run(cmd)
 356
 357 def parse_outline(pdf, level_threshold):
 358     """Create a structure reflecting the outline of a PDF.
 359     A chapter heading looks like this:
 360
 361     BookmarkTitle: 2. What is sound?
 362     BookmarkLevel: 1
 363     BookmarkPageNumber: 3
 364     """
 365     cmd = ('pdftk', pdf, 'dump_data')
 366     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 367     outline, err = p.communicate()
 368     lines = (x.strip() for x in outline.split('\n') if x.strip())
 369     contents = []
 370
 371     def extract(expected, conv=str.strip):
 372         line = lines.next()
 373         try:
 374             k, v = line.split(':', 1)
 375             if k == expected:
 376                 return conv(v)
 377         except ValueError:
 378             log("trouble with line %r" %line)
 379
 380     #There are a few useless variables, then the pagecount, then the contents.
 381     #The pagecount is useful, so pick it up first.
 382     page_count = None
 383     while page_count == None:
 384         page_count = extract('NumberOfPages', int)
 385
 386     try:
 387         while True:
 388             title = extract('BookmarkTitle')
 389             if title is not None:
 390                 level = extract('BookmarkLevel', int)
 391                 pagenum = extract('BookmarkPageNumber', int)
 392                 if level <= level_threshold and None not in (level, pagenum):
 393                     contents.append((title, level, pagenum))
 394     except StopIteration:
 395         pass
 396
 397     return contents, outline, page_count
 398
 399
class Book(object):
    """One book being fetched from a server and turned into a PDF in
    its own temporary working directory."""
    # page numbering styles handed to the page numbering step for the
    # body and for the preamble
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'
    # default rendering engine; __init__ overrides it when given one
    engine= 'webkit'
    # whether __del__ should attempt a cleanup
    _try_cleanup_on_del = config.TRY_BOOK_CLEANUP_ON_DEL
 405
 406     def notify_watcher(self, message=None):
 407         if self.watcher:
 408             if  message is None:
 409                 #message is the name of the caller
 410                 #XXX look at using inspect module
 411                 import traceback
 412                 message = traceback.extract_stack(None, 2)[0][2]
 413             log("notify_watcher called with '%s'" % message)
 414             self.watcher(message)
 415
    def __enter__(self):
        """Context manager entry: the book itself is what the with
        statement binds."""
        return self
 418
    def __exit__(self, exc_type, exc_value, traceback):
        """Context manager exit: call cleanup() whether or not the with
        block raised.  Nothing is returned, so an exception raised in
        the block carries on propagating."""
        self.cleanup()
        #could deal with exceptions here and return true
 422
 423     def __init__(self, book, server, bookname,
 424                  page_settings=None, engine=None, watcher=None, isbn=None,
 425                  license=config.DEFAULT_LICENSE):
 426         log("*** Starting new book %s ***" % bookname)
 427         self.book = book
 428         self.server = server
 429         self.watcher = watcher
 430         self.isbn = isbn
 431         self.license = license
 432         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 433         os.chmod(self.workdir, 0755)
 434         defaults = SERVER_DEFAULTS[server]
 435         self.lang = defaults['lang']
 436         self.dir  = defaults['dir']
 437
 438         self.body_html_file = self.filepath('body.html')
 439         self.body_pdf_file = self.filepath('body.pdf')
 440         self.body_index_file = self.filepath('body.txt')
 441         self.preamble_html_file = self.filepath('preamble.html')
 442         self.preamble_pdf_file = self.filepath('preamble.pdf')
 443         self.tail_html_file = self.filepath('tail.html')
 444         self.tail_pdf_file = self.filepath('tail.pdf')
 445         self.isbn_pdf_file = None
 446         self.pdf_file = self.filepath('final.pdf')
 447
 448         self.publish_name = bookname
 449         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 450         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 451
 452         self.book_url = config.BOOK_URL % (self.server, self.book)
 453         self.toc_url = config.TOC_URL % (self.server, self.book)
 454
 455         self.maker = PageSettings(**page_settings)
 456
 457         if engine is not None:
 458             self.engine = engine
 459         self.notify_watcher()
 460
 461     if config.TRY_BOOK_CLEANUP_ON_DEL:
 462         #Dont even define __del__ if it is not used.
 463         _try_cleanup_on_del = True
 464         def __del__(self):
 465             if self._try_cleanup_on_del and os.path.exists(self.workdir):
 466                 self._try_cleanup_on_del = False #or else you can get in bad cycles
 467                 self.cleanup()
 468
 469     def __getattr__(self, attr):
 470         """catch unloaded books and load them"""
 471         #log('looking for missing attribute "%s"' % (attr))
 472         if attr == 'tree':
 473             self.load_book()
 474             return self.tree
 475         if attr == 'toc':
 476             self.load_toc()
 477             return self.toc
 478         raise AttributeError("no such member: '%s'" % attr)
 479
 480
    def filepath(self, fn):
        """Return the path of the file named fn inside this book's
        working directory."""
        return os.path.join(self.workdir, fn)
 483
 484     def save_data(self, fn, data):
 485         """Save without tripping up on unicode"""
 486         if isinstance(data, unicode):
 487             data = data.encode('utf8', 'ignore')
 488         f = open(fn, 'w')
 489         f.write(data)
 490         f.close()
 491
 492     def save_tempfile(self, fn, data):
 493         """Save the data in a temporary directory that will be cleaned
 494         up when all is done.  Return the absolute file path."""
 495         fn = self.filepath(fn)
 496         self.save_data(fn, data)
 497         return fn
 498
 499     def extract_pdf_outline(self):
 500         self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
 501         for x in self.outline_contents:
 502             log(x)
 503         return number_of_pages
 504
 505     def make_body_pdf(self):
 506         """Make a pdf of the HTML, using webkit"""
 507         #1. Save the html
 508         html_text = lxml.etree.tostring(self.tree, method="html")
 509         self.save_data(self.body_html_file, html_text)
 510
 511         #2. Make a pdf of it
 512         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
 513                                 engine=self.engine, outline=True)
 514         self.notify_watcher('generate_pdf')
 515
 516         n_pages = self.extract_pdf_outline()
 517
 518         log ("found %s pages in pdf" % n_pages)
 519         #4. resize pages, shift gutters, even pages
 520         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 521         self.notify_watcher('reshape_pdf')
 522
 523         #5 add page numbers
 524         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 525                               numbers=self.page_numbers)
 526         self.notify_watcher("number_pdf")
 527         self.notify_watcher()
 528
 529     def make_preamble_pdf(self):
 530         contents = self.make_contents()
 531         inside_cover_html = self.compose_inside_cover()
 532         html = ('<html dir="%s"><head>\n'
 533                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 534                 '<link rel="stylesheet" href="%s" />\n'
 535                 '</head>\n<body>\n'
 536                 '<h1 class="frontpage">%s</h1>'
 537                 '%s\n'
 538                 '<div class="contents">%s</div>\n'
 539                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 540                 '<!--%s--></div></body></html>'
 541                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 542                      contents, self.title)
 543         self.save_data(self.preamble_html_file, html)
 544
 545         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
 546                                 engine=self.engine)
 547
 548         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 549
 550         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 551                             numbers=self.preamble_page_numbers,
 552                             number_start=-2)
 553
 554         self.notify_watcher()
 555
 556     def make_end_matter_pdf(self):
 557         """Make an inside back cover and a back cover.  If there is an
 558         isbn number its barcode will be put on the back cover."""
 559         if self.isbn:
 560             self.isbn_pdf_file = self.filepath('isbn.pdf')
 561             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 562             self.notify_watcher('make_barcode_pdf')
 563
 564         self.save_data(self.tail_html_file, self.compose_end_matter())
 565         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file,
 566                                 engine=self.engine)
 567
 568         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 569                                centre_end=True, even_pages=False)
 570         self.notify_watcher()
 571
 572     def make_book_pdf(self):
 573         """A convenient wrapper of a few necessary steps"""
 574         # now the Xvfb server is needed. make sure it has had long enough to get going
 575         self.wait_for_xvfb()
 576         self.make_body_pdf()
 577         self.make_preamble_pdf()
 578         self.make_end_matter_pdf()
 579
 580         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 581                     self.body_pdf_file, self.tail_pdf_file,
 582                     self.isbn_pdf_file)
 583
 584         self.notify_watcher('concatenated_pdfs')
 585
 586
 587     def make_simple_pdf(self, mode):
 588         """Make a simple pdf document without contents or separate
 589         title page.  This is used for multicolumn newspapers and for
 590         web-destined pdfs."""
 591         self.wait_for_xvfb()
 592         #0. Add heading to begining of html
 593         body = list(self.tree.cssselect('body'))[0]
 594         e = body.makeelement('h1', {'id': 'book-title'})
 595         e.text = self.title
 596         body.insert(0, e)
 597         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 598         e.addnext(intro)
 599
 600         #0.5 adjust parameters to suit the particular kind of output
 601         if mode == 'web':
 602             self.maker.gutter = 0
 603
 604         #1. Save the html
 605         html_text = lxml.etree.tostring(self.tree, method="html")
 606         self.save_data(self.body_html_file, html_text)
 607
 608         #2. Make a pdf of it (direct to to final pdf)
 609         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file,
 610                                 engine=self.engine, outline=True)
 611         self.notify_watcher('generate_pdf')
 612         #n_pages = self.extract_pdf_outline()
 613         n_pages = count_pdf_pages(self.pdf_file)
 614
 615         if mode != 'web':
 616             #3. resize pages and shift gutters.
 617             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 618             self.notify_watcher('reshape_pdf')
 619
 620             #4. add page numbers
 621             self.maker.number_pdf(self.pdf_file, n_pages,
 622                                   dir=self.dir, numbers=self.page_numbers)
 623             self.notify_watcher("number_pdf")
 624         self.notify_watcher()
 625
 626
 627     def rotate180(self):
 628         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 629         presses."""
 630         rotated = self.filepath('final-rotate.pdf')
 631         unrotated = self.filepath('final-pre-rotate.pdf')
 632         #leave the unrotated pdf intact at first, in case of error.
 633         rotate_pdf(self.pdf_file, rotated)
 634         os.rename(self.pdf_file, unrotated)
 635         os.rename(rotated, self.pdf_file)
 636         self.notify_watcher()
 637
 638     def publish_pdf(self):
 639         """Move the finished PDF to its final resting place"""
 640         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 641         os.rename(self.pdf_file, self.publish_file)
 642         self.notify_watcher()
 643
 644     def load_toc(self):
 645         """From the TOC.txt file create a list of TocItems with
 646         the attributes <status>, <chapter>, and <title>.
 647
 648         <status> is a number, with the following meaning:
 649
 650               0 - section heading with no chapter
 651               1 - chapter heading
 652               2 - book title
 653
 654         The TocItem object has convenience functions <is_chapter> and
 655         <is_section>.
 656
 657         <chapter> is twiki name of the chapter.
 658
 659         <title> is a human readable title for the chapter.  It is likely to
 660         differ from the title given in the chapter's <h1> heading.
 661         """
 662         f = urlopen(self.toc_url)
 663         self.toc = []
 664         while True:
 665             try:
 666                 self.toc.append(TocItem(f.next().strip(),
 667                                         f.next().strip(),
 668                                         f.next().strip()))
 669             except StopIteration:
 670                 break
 671         f.close()
 672         self.notify_watcher()
 673
 674     def load_book(self, tidy=True):
 675         """Fetch and parse the raw html of the book.  If tidy is true
 676         (default) links in the document will be made absolute."""
 677         f = urlopen(self.book_url)
 678         html = f.read()
 679         f.close()
 680         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 681                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 682                 '</head>\n<body>\n'
 683                 '%s\n'
 684                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 685                 'A FLOSSManuals book</div>\n</body></html>'
 686                 ) % (self.dir, self.book, html)
 687
 688         self.save_tempfile('raw.html', html)
 689
 690         tree = lxml.html.document_fromstring(html)
 691         if tidy:
 692             tree.make_links_absolute(self.book_url)
 693         self.tree = tree
 694         self.headings = [x for x in tree.cssselect('h1')]
 695         if self.headings:
 696             self.headings[0].set('class', "first-heading")
 697         for h1 in self.headings:
 698             h1.title = h1.text_content().strip()
 699         self.notify_watcher()
 700
    def load(self):
        """Wrapper around all necessary load methods."""
        # the book's html first, then its table of contents
        self.load_book()
        self.load_toc()
 705
 706     def make_contents(self):
 707         """Generate HTML containing the table of contents.  This can
 708         only be done after the main PDF has been made."""
 709         header = '<h1>Table of Contents</h1><table class="toc">\n'
 710         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 711                     '<td class="pagenumber">%s</td></tr>\n')
 712         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 713         footer = '\n</table>'
 714
 715         contents = []
 716
 717         chapter = 1
 718         page_num = 1
 719         subsections = [] # for the subsection heading pages.
 720
 721         outline_contents = iter(self.outline_contents)
 722         headings = iter(self.headings)
 723
 724         for t in self.toc:
 725             if t.is_chapter():
 726                 try:
 727                     h1 = headings.next()
 728                 except StopIteration:
 729                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 730                     break
 731                 h1_text, level, page_num = outline_contents.next()
 732                 log("%r %r" % (h1.title, h1_text))
 733                 contents.append(row_tmpl % (chapter, h1.title, page_num))
 734                 chapter += 1
 735             elif t.is_section():
 736                 contents.append(section_tmpl % t.title)
 737             else:
 738                 log("mystery TOC item: %s" % t)
 739
 740         doc = header + '\n'.join(contents) + footer
 741         self.notify_watcher()
 742         return doc
 743
 744     def add_section_titles(self):
 745         """Add any section heading pages that the TOC.txt file
 746         specifies.  These are sub-book, super-chapter groupings.
 747
 748         Also add initial numbers to chapters.
 749         """
 750         headings = iter(self.headings)
 751         chapter = 1
 752         section = None
 753
 754         for t in self.toc:
 755             if t.is_chapter() and section is not None:
 756                 try:
 757                     h1 = headings.next()
 758                 except StopIteration:
 759                     log("heading not found for %s (previous h1 missing?)" % t)
 760                     break
 761                 item = h1.makeelement('div', Class='chapter')
 762                 log(h1.title, debug='HTMLGEN')
 763                 item.text = h1.title
 764                 _add_initial_number(item, chapter)
 765
 766                 section.append(item)
 767
 768                 if not section_placed:
 769                     log("placing section", debug='HTMLGEN')
 770                     h1.addprevious(section)
 771                     section_placed = True
 772                 else:
 773                     log("NOT placing section", debug='HTMLGEN')
 774
 775                 #put a bold number at the beginning of the h1.
 776                 _add_initial_number(h1, chapter)
 777                 chapter += 1
 778
 779             elif t.is_section():
 780                 section = self.tree.makeelement('div', Class="subsection")
 781                 # section Element complains when you try to ask it whether it
 782                 # has been placed (though it does know)
 783                 section_placed = False
 784                 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
 785                 heading.set("Class", "subsection-heading")
 786                 section.append(heading)
 787
 788         self.notify_watcher()
 789
 790
 791     def add_css(self, css=None, mode='book'):
 792         """If css looks like a url, use it as a stylesheet link.
 793         Otherwise it is the CSS itself, which is saved to a temporary file
 794         and linked to."""
 795         log("css is %r" % css)
 796         htmltree = self.tree
 797         if css is None or not css.strip():
 798             defaults = SERVER_DEFAULTS[self.server]
 799             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 800         elif not re.match(r'^http://\S+$', css):
 801             fn = self.save_tempfile('objavi.css', css)
 802             url = 'file://' + fn
 803         else:
 804             url = css
 805         #XXX for debugging and perhaps sensible anyway
 806         #url = url.replace('file:///home/douglas/objavi2', '')
 807
 808
 809         #find the head -- it's probably first child but lets not assume.
 810         for child in htmltree:
 811             if child.tag == 'head':
 812                 head = child
 813                 break
 814         else:
 815             head = htmltree.makeelement('head')
 816             htmltree.insert(0, head)
 817
 818         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 819         self.css_url = url
 820         self.notify_watcher()
 821         return url
 822
 823     def set_title(self, title=None):
 824         """If a string is supplied, it becomes the book's title.
 825         Otherwise a guess is made."""
 826         if title:
 827             self.title = title
 828         else:
 829             titles = [x.text_content() for x in self.tree.cssselect('title')]
 830             if titles and titles[0]:
 831                 self.title = titles[0]
 832             else:
 833                 #oh well
 834                 self.title = 'A Manual About ' + self.book
 835         return self.title
 836
 837     def _read_localised_template(self, template, fallbacks=['en']):
 838         """Try to get the template in the approriate language, otherwise in english."""
 839         for lang in [self.lang] + fallbacks:
 840             try:
 841                 fn = template % (lang)
 842                 f = open(fn)
 843                 break
 844             except IOError, e:
 845                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 846                 log(e)
 847         template = f.read()
 848         f.close()
 849         return template
 850
 851     def compose_inside_cover(self):
 852         """create the markup for the preamble inside cover."""
 853         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 854
 855         if self.isbn:
 856             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 857         else:
 858             isbn_text = ''
 859
 860         return template % {'date': time.strftime('%Y-%m-%d'),
 861                            'isbn': isbn_text,
 862                            'license': self.license,
 863                            }
 864
 865
 866     def compose_end_matter(self):
 867         """create the markup for the end_matter inside cover.  If
 868         self.isbn is not set, the html will result in a pdf that
 869         spills onto two pages.
 870         """
 871         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 872
 873         d = {'css_url': self.css_url,
 874              'title': self.title
 875              }
 876
 877         if self.isbn:
 878             d['inside_cover_style'] = ''
 879         else:
 880             d['inside_cover_style'] = 'page-break-after: always'
 881
 882         return template % d
 883
 884
 885
 886
 887     def spawn_x(self):
 888         """Start an Xvfb instance, using a new server number.  A
 889         reference to it is stored in self.xvfb, which is used to kill
 890         it when the pdf is done.
 891
 892         Note that Xvfb doesn't interact well with dbus which is
 893         present on modern desktops.
 894         """
 895         #Find an unused server number (in case two cgis are running at once)
 896         while True:
 897             servernum = random.randrange(50, 500)
 898             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 899                 break
 900
 901         self.xserver_no = ':%s' % servernum
 902
 903         authfile = self.filepath('Xauthority')
 904         os.environ['XAUTHORITY'] = authfile
 905
 906         #mcookie(1) eats into /dev/random, so avoid that
 907         from hashlib import md5
 908         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 909         mcookie = m.hexdigest()
 910
 911         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 912
 913         self.xvfb = Popen(['Xvfb', self.xserver_no,
 914                            '-screen', '0', '1024x768x24',
 915                            '-pixdepths', '32',
 916                            #'-blackpixel', '0',
 917                            #'-whitepixel', str(2 ** 24 -1),
 918                            #'+extension', 'Composite',
 919                            '-dpi', '96',
 920                            '-kb',
 921                            '-nolisten', 'tcp',
 922                            ])
 923
 924         # We need to wait a bit before the Xvfb is ready.  but the
 925         # downloads are so slow that that probably doesn't matter
 926
 927         self.xvfb_ready_time = time.time() + 2
 928
 929         os.environ['DISPLAY'] = self.xserver_no
 930         log(self.xserver_no)
 931
 932     def wait_for_xvfb(self):
 933         """wait until a previously set time before continuing.  This
 934         is so Xvfb has time to properly start."""
 935         if hasattr(self, 'xvfb'):
 936             d = self.xvfb_ready_time - time.time()
 937             if d > 0:
 938                 time.sleep(d)
 939                 self.notify_watcher()
 940
 941     def cleanup_x(self):
 942         """Try very hard to kill off Xvfb.  In addition to killing
 943         this instance's xvfb, occasionally (randomly) search for
 944         escaped Xvfb instances and kill those too."""
 945         if not hasattr(self, 'xvfb'):
 946             return
 947         check_call(['xauth', 'remove', self.xserver_no])
 948         p = self.xvfb
 949         log("trying to kill Xvfb %s" % p.pid)
 950         os.kill(p.pid, 15)
 951         for i in range(10):
 952             if p.poll() is not None:
 953                 log("%s died with %s" % (p.pid, p.poll()))
 954                 break
 955             log("%s not dead yet" % p.pid)
 956             time.sleep(0.2)
 957         else:
 958             log("Xvfb would not die! kill -9! kill -9!")
 959             os.kill(p.pid, 9)
 960
 961         if random.random() < 0.05:
 962             #kill old xvfbs occasionally, if there are any.
 963             self.kill_old_xvfbs()
 964
 965     def kill_old_xvfbs(self):
 966         """Sometimes, despite everything, Xvfb instances hang around
 967         well after they are wanted -- for example if the cgi process
 968         dies particularly badly. So kill them if they have been
 969         running for a long time."""
 970         log("running kill_old_xvfbs")
 971         p = Popen(['ps', '-C' 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 972         data = p.communicate()[0].strip()
 973         if data:
 974             lines = data.split('\n')
 975             for line in lines:
 976                 log('dealing with ps output "%s"' % line)
 977                 try:
 978                     pid, days_, hours, minutes, seconds = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$').groups()
 979                 except AttributeError:
 980                     log("Couldn't parse that line!")
 981                 # 50 minutes should be enough xvfb time for anyone
 982                 if days or hours or int(minutes) > 50:
 983                     log("going to kill pid %s" % pid)
 984                     os.kill(int(pid), 15)
 985                     time.sleep(0.5)
 986                     os.kill(int(pid), 9)
 987         self.notify_watcher()
 988
 989     def cleanup(self):
 990         self.cleanup_x()
 991         if not config.KEEP_TEMP_FILES:
 992             for fn in os.listdir(self.workdir):
 993                 os.remove(os.path.join(self.workdir, fn))
 994             os.rmdir(self.workdir)
 995         else:
 996             log("NOT removing '%s', containing the following files:" % self.workdir)
 997             log(*os.listdir(self.workdir))
 998
 999         self.notify_watcher()
1000
1001