fmbook.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This provides abstractions of texts and virtual printers and manages
   3 # their interactions.
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 """Library module representing a complete FM book being turned into a
  22 PDF"""
  23
  24 import os, sys
  25 import tempfile
  26 import re, time
  27 import random
  28 from urllib2 import urlopen
  29 from subprocess import Popen, check_call, PIPE
  30
  31 import lxml.etree, lxml.html
  32 import lxml, lxml.html, lxml.etree
  33
  34 import config
  35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
  36
  37 TMPDIR = os.path.abspath(config.TMPDIR)
  38 DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
  39 PUBLISH_PATH = "%s/books/" % DOC_ROOT
  40
  41
  42 def log(*messages, **kwargs):
  43     """Send the messages to the appropriate place (stderr, or syslog).
  44     If a <debug> keyword is specified, the message is only printed if
  45     its value ias in the global DEBUG_MODES."""
  46     if 'debug' not in kwargs or config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES:
  47         for m in messages:
  48             try:
  49                 print >> sys.stderr, m
  50             except Exception:
  51                 print >> sys.stderr, repr(m)
  52
  53 def _add_initial_number(e, n):
  54     """Put a styled chapter number n at the beginning of element e."""
  55     initial = e.makeelement("strong", Class="initial")
  56     e.insert(0, initial)
  57     initial.tail = ' '
  58     if e.text is not None:
  59         initial.tail += e.text
  60     e.text = ''
  61     initial.text = "%s." % n
  62
  63 def _add_chapter_cookie(e):
  64     """add magic hidden text to help with contents generation"""
  65     cookie = e.makeelement("span", Class="heading-cookie", dir="ltr",
  66                            style="font-size:6pt; line-height: 6pt; color: #fff; width:0;"
  67                            " float:left; margin:-2em; z-index: -67; display: block;"
  68                            )
  69     cookie.text = ''.join(random.choice(config.CHAPTER_COOKIE_CHARS) for x in range(8))
  70     e.cookie = cookie.text
  71     e.addnext(cookie)
  72     #e.append(cookie)
  73
  74
  75 class TocItem(object):
  76     """This makes sense of the tuples from TOC.txt files"""
  77     def __init__(self, status, chapter, title):
  78         # status is
  79         #  0 - section heading with no chapter
  80         #  1 - chapter heading
  81         #  2 - book title
  82         #
  83         # chapter is twiki name of the chapter
  84         # title is a human readable name of the chapter.
  85         self.status = status
  86         self.chapter = chapter
  87         self.title = title
  88
  89     def is_chapter(self):
  90         return self.status == '1'
  91
  92     def is_section(self):
  93         return self.status == '0'
  94
  95     def __str__(self):
  96         return '<toc: %s>' %  ', '.join('%s: %s' % x for x in self.__dict__.iteritems())
  97
  98
  99 def run(cmd):
 100     try:
 101         p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 102         out, err = p.communicate()
 103     except Exception:
 104         log("Failed on command: %r" % cmd)
 105         raise
 106     log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
 107         (' '.join(cmd), cmd[0], p.poll(), out, err))
 108
 109
 110 def find_containing_paper(w, h):
 111     size = None
 112     for name, pw, ph in config.PAPER_SIZES:
 113         if pw >= w and ph >= h:
 114             mw = (pw - w) * 0.5
 115             mh = (ph - h) * 0.5
 116             return (name, mw, mh)
 117
 118     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
 119                      (w * POINT_2_MM, h * POINT_2_MM))
 120
 121
 122
 123 class PageSettings(object):
 124     """Calculates and wraps commands for the generation and processing
 125     of PDFs"""
 126     def __init__(self, pointsize, **kwargs):
 127         # the formulas for default gutters, margins and column margins
 128         # are quite ad-hoc and certainly improvable.
 129
 130         self.width, self.height = pointsize
 131         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
 132
 133         self.gutter = kwargs.get('gutter', (config.BASE_GUTTER +
 134                                             config.PROPORTIONAL_GUTTER * self.width))
 135
 136         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
 137         self.top_margin = kwargs.get('top_margin', default_margin)
 138         self.side_margin = kwargs.get('top_margin', default_margin)
 139         self.bottom_margin = kwargs.get('top_margin', default_margin)
 140         self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
 141         self.columns = kwargs.get('columns', 1)
 142
 143         self.column_margin = kwargs.get('column_margin', default_margin * 2 / (4.0 + self.columns))
 144
 145         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
 146         self.number_margin = self.side_margin
 147
 148         # calculate margins in mm for browsers
 149         self.margins = []
 150         for m, clip in ((self.top_margin, clipy),
 151                         (self.side_margin, clipx + 0.5 * self.gutter),
 152                         (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
 153                         (self.side_margin, clipx + 0.5 * self.gutter),
 154                         ):
 155             if m is None:
 156                 m = default_margin
 157             self.margins.append((m + clip) * POINT_2_MM)
 158
 159         for x in locals().iteritems():
 160             log("%s: %s" % x, debug='PDFGEN')
 161         for x in dir(self):
 162             log("%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
 163
 164
 165
 166     def _webkit_command(self, html, pdf, outline=False):
 167         m = [str(x) for x in self.margins]
 168         outline_args = ['--outline'] * outline
 169         cmd = ([config.WKHTMLTOPDF, '-q', '-s', self.papersize,
 170                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 171                ] + outline_args +
 172                config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf])
 173         log(' '.join(cmd))
 174         return cmd
 175
 176     def _gecko_command(self, html, pdf, outline=False):
 177         m = [str(x) for x in self.margins]
 178         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 179         cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
 180                html, '-printprinter', self.moz_printer]
 181         log(' '.join(cmd))
 182         return cmd
 183
 184     def make_raw_pdf(self, html, pdf, engine='webkit', outline=False):
 185         func = getattr(self, '_%s_command' % engine)
 186         if self.columns == 1:
 187             cmd = func(html, pdf, outline=outline)
 188             run(cmd)
 189         else:
 190             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 191             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 192             page_width = column_width + self.column_margin
 193
 194             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 195                                        gutter=0, top_margin=self.top_margin,
 196                                        side_margin=self.column_margin * 0.5,
 197                                        bottom_margin=self.bottom_margin)
 198
 199             column_pdf = pdf[:-4] + '-single-column.pdf'
 200             columnmaker.make_raw_pdf(html, column_pdf, engine=engine)
 201             columnmaker.reshape_pdf(column_pdf)
 202
 203             cmd = ['pdfnup',
 204                    '--nup', '%sx1' % int(self.columns),
 205                    '--paper', self.papersize.lower() + 'paper',
 206                    '--outfile', pdf,
 207                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 208                    '--noautoscale', 'true',
 209                    '--orient', 'portrait',
 210                    #'--tidy', 'false',
 211                    column_pdf
 212                    ]
 213
 214             run(cmd)
 215
 216
 217
 218     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 219                     even_pages=True):
 220         """Spin the pdf for RTL text, resize it to the right size, and
 221         shift the gutter left and right"""
 222         ops = 'resize'
 223         if self.gutter:
 224             ops += ',shift'
 225         if even_pages:
 226             ops += ',even_pages'
 227         gutter = self.gutter
 228         if dir == 'RTL':
 229             gutter = -gutter
 230         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 231                'dir=%s' % dir,
 232                'filename=%s' % pdf,
 233                'output_filename=%s' % pdf,
 234                'operation=%s' % ops,
 235                'width=%s' % self.width,
 236                'height=%s' % self.height,
 237                'offset=%s' % gutter,
 238                'centre_start=%s' % centre_start,
 239                'centre_end=%s' % centre_end,
 240                ]
 241         run(cmd)
 242
 243     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 244                     number_start=1):
 245         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 246                'operation=page_numbers',
 247                'dir=%s' % dir,
 248                'filename=%s' % pdf,
 249                'output_filename=%s' % pdf,
 250                'number_start=%s' % number_start,
 251                'number_style=%s' % numbers,
 252                'number_bottom=%s' % self.number_bottom,
 253                'number_margin=%s' % self.number_margin,
 254                ]
 255         run(cmd)
 256
 257     def number_pdf(self, pdf, pages, **kwargs):
 258         # if there are too many pages for pdfedit to handle in one go,
 259         # split the job into bits.  <pages> may not be exact
 260         if pages is None or pages <= PDFEDIT_MAX_PAGES:
 261             self._number_pdf(pdf, **kwargs)
 262         else:
 263             # section_size must be even
 264             sections = pages // PDFEDIT_MAX_PAGES + 1
 265             section_size = (pages // sections + 2) & ~1
 266
 267             pdf_sections = []
 268             s = kwargs.pop('number_start', 1)
 269             while s < pages:
 270                 e = s + section_size - 1
 271                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 272                 if e < pages - 1:
 273                     page_range = '%s-%s' % (s, e)
 274                 else:
 275                     page_range = '%s-end' % s
 276                 run(['pdftk',
 277                      pdf,
 278                      'cat',
 279                      page_range,
 280                      'output',
 281                      pdf_section,
 282                      ])
 283                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 284                 pdf_sections.append(pdf_section)
 285                 s = e + 1
 286
 287             concat_pdfs(pdf, *pdf_sections)
 288
 289     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 290         """Put an ISBN barcode in a corner of a single blank page."""
 291
 292         position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
 293         cmd1 = [config.BOOKLAND,
 294                 '--position', position,
 295                 str(isbn)]
 296         cmd2 = ['ps2pdf',
 297                 '-dFIXEDMEDIA',
 298                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 299                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 300                 '-', pdf]
 301
 302         p1 = Popen(cmd1, stdout=PIPE)
 303         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 304         out, err = p2.communicate()
 305
 306         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 307         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 308
 309
 310
 311
 312 def concat_pdfs(name, *args):
 313     """Join all the named pdfs together into one and save it as <name>"""
 314     cmd = ['pdftk']
 315     cmd.extend(x for x in args if x is not None)
 316     cmd += ['cat', 'output', name]
 317     run(cmd)
 318
 319 def index_pdf(pdf, text=None):
 320     """Use pdftotext to extract utf-8 text from a pdf, using ^L to
 321     separate pages."""
 322     if text is None:
 323         text = pdf + '.index.txt'
 324     cmd = ['pdftotext',
 325            #'-layout', #keeps more original formatting
 326            pdf,
 327            text]
 328     run(cmd)
 329     return text
 330
 331 def rotate_pdf(pdfin, pdfout):
 332     """Turn the PDF on its head"""
 333     cmd = ['pdftk', pdfin,
 334            'cat',
 335            '1-endD',
 336            'output',
 337            pdfout
 338            ]
 339     run(cmd)
 340
 341
 342 class Book(object):
 343     page_numbers = 'latin'
 344     preamble_page_numbers = 'roman'
 345     engine= 'webkit'
 346     _try_cleanup_on_del = True
 347
 348     def notify_watcher(self, message=None):
 349         if self.watcher:
 350             if  message is None:
 351                 #message is the name of the caller
 352                 #XXX look at using inspect module
 353                 import traceback
 354                 message = traceback.extract_stack(None, 2)[0][2]
 355             log("notify_watcher called with '%s'" % message)
 356             self.watcher(message)
 357
 358     def __enter__(self):
 359         return self
 360
 361     def __exit__(self, exc_type, exc_value, traceback):
 362         self.cleanup()
 363         #could deal with exceptions here and return true
 364
 365     def __init__(self, book, server, bookname,
 366                  page_settings=None, engine=None, watcher=None, isbn=None,
 367                  license=config.DEFAULT_LICENSE):
 368         log("*** Starting new book %s ***" % bookname)
 369         self.book = book
 370         self.server = server
 371         self.watcher = watcher
 372         self.isbn = isbn
 373         self.license = license
 374         self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
 375         os.chmod(self.workdir, 0755)
 376         defaults = SERVER_DEFAULTS.get(server, SERVER_DEFAULTS[DEFAULT_SERVER])
 377         self.default_css = defaults['css']
 378         self.lang = defaults['lang']
 379         self.dir  = defaults['dir']
 380
 381         self.body_html_file = self.filepath('body.html')
 382         self.body_pdf_file = self.filepath('body.pdf')
 383         self.body_index_file = self.filepath('body.txt')
 384         self.preamble_html_file = self.filepath('preamble.html')
 385         self.preamble_pdf_file = self.filepath('preamble.pdf')
 386         self.tail_html_file = self.filepath('tail.html')
 387         self.tail_pdf_file = self.filepath('tail.pdf')
 388         self.isbn_pdf_file = None
 389         self.pdf_file = self.filepath('final.pdf')
 390
 391         self.publish_name = bookname
 392         self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
 393         self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
 394
 395         self.book_url = config.BOOK_URL % (self.server, self.book)
 396         self.toc_url = config.TOC_URL % (self.server, self.book)
 397
 398         self.set_page_dimensions(page_settings)
 399
 400         if engine is not None:
 401             self.engine = engine
 402         self.notify_watcher()
 403
 404     def __del__(self):
 405         if os.path.exists(self.workdir) and self._try_cleanup_on_del:
 406             self._try_cleanup_on_del = False #or else you can get in bad cycles
 407             self.cleanup()
 408
 409     def __getattr__(self, attr):
 410         """catch unloaded books and load them"""
 411         #log('looking for missing attribute "%s"' % (attr))
 412         if attr == 'tree':
 413             self.load_book()
 414             return self.tree
 415         if attr == 'toc':
 416             self.load_toc()
 417             return self.toc
 418         raise AttributeError("no such member: '%s'" % attr)
 419
 420
 421     def filepath(self, fn):
 422         return os.path.join(self.workdir, fn)
 423
 424     def save_data(self, fn, data):
 425         """Save without tripping up on unicode"""
 426         if isinstance(data, unicode):
 427             data = data.encode('utf8', 'ignore')
 428         f = open(fn, 'w')
 429         f.write(data)
 430         f.close()
 431
 432     def save_tempfile(self, fn, data):
 433         """Save the data in a temporary directory that will be cleaned
 434         up when all is done.  Return the absolute file path."""
 435         fn = self.filepath(fn)
 436         self.save_data(fn, data)
 437         return fn
 438
 439     def set_page_dimensions(self, dimensions):
 440         self.maker = PageSettings(**dimensions)
 441
 442
 443     def extract_pdf_text(self):
 444         """Extract the text from the body pdf, split into pages, so
 445         that the correct page can be found to generate the table of
 446         contents."""
 447         index_pdf(self.body_pdf_file, self.body_index_file)
 448         f = open(self.body_index_file)
 449         s = unicode(f.read(), 'utf8')
 450         f.close()
 451         #pages are spearated by formfeed character "^L", "\f" or chr(12)
 452         self.text_pages = s.split("\f")
 453         #there is sometimes (probably always) an unwanted ^L at the end
 454         return len(self.text_pages)
 455
 456     def make_body_pdf(self):
 457         """Make a pdf of the HTML, using webkit"""
 458         #1. Save the html
 459         html_text = lxml.etree.tostring(self.tree, method="html")
 460         self.save_data(self.body_html_file, html_text)
 461
 462         #2. Make a pdf of it
 463         self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
 464                                 engine=self.engine)
 465         self.notify_watcher('generate_pdf')
 466
 467         #3. extract the text for finding contents.
 468         n_pages = self.extract_pdf_text()
 469         log ("found %s pages in pdf" % n_pages)
 470         #4. resize pages, shift gutters, even pages
 471         self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
 472         self.notify_watcher('reshape_pdf')
 473
 474         #5 add page numbers
 475         self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
 476                               numbers=self.page_numbers)
 477         self.notify_watcher("number_pdf")
 478         self.notify_watcher()
 479
 480     def make_preamble_pdf(self):
 481         contents = self.make_contents()
 482         inside_cover_html = self.compose_inside_cover()
 483         html = ('<html dir="%s"><head>\n'
 484                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 485                 '<link rel="stylesheet" href="%s" />\n'
 486                 '</head>\n<body>\n'
 487                 '<h1 class="frontpage">%s</h1>'
 488                 '%s\n'
 489                 '<div class="contents">%s</div>\n'
 490                 '<div style="page-break-after: always; color:#fff" class="unseen">.'
 491                 '<!--%s--></div></body></html>'
 492                 ) % (self.dir, self.css_url, self.title, inside_cover_html,
 493                      contents, self.title)
 494         self.save_data(self.preamble_html_file, html)
 495
 496         self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
 497                                 engine=self.engine)
 498
 499         self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
 500
 501         self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
 502                             numbers=self.preamble_page_numbers,
 503                             number_start=-2)
 504
 505         self.notify_watcher()
 506
 507     def make_end_matter_pdf(self):
 508         """Make an inside back cover and a back cover.  If there is an
 509         isbn number its barcode will be put on the back cover."""
 510         if self.isbn:
 511             self.isbn_pdf_file = self.filepath('isbn.pdf')
 512             self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
 513             self.notify_watcher('make_barcode_pdf')
 514
 515         self.save_data(self.tail_html_file, self.compose_end_matter())
 516         self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file,
 517                                 engine=self.engine)
 518
 519         self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
 520                                centre_end=True, even_pages=False)
 521         self.notify_watcher()
 522
 523     def make_book_pdf(self):
 524         """A convenient wrapper of a few necessary steps"""
 525         # now the Xvfb server is needed. make sure it has had long enough to get going
 526         self.wait_for_xvfb()
 527         self.make_body_pdf()
 528         self.make_preamble_pdf()
 529         self.make_end_matter_pdf()
 530
 531         concat_pdfs(self.pdf_file, self.preamble_pdf_file,
 532                     self.body_pdf_file, self.tail_pdf_file,
 533                     self.isbn_pdf_file)
 534
 535         self.notify_watcher('concatenated_pdfs')
 536
 537
 538     def make_simple_pdf(self, mode):
 539         """Make a simple pdf document without contents or separate
 540         title page.  This is used for multicolumn newspapers and for
 541         web-destined pdfs."""
 542         self.wait_for_xvfb()
 543         #0. Add heading to begining of html
 544         body = list(self.tree.cssselect('body'))[0]
 545         e = body.makeelement('h1', {'id': 'book-title'})
 546         e.text = self.title
 547         body.insert(0, e)
 548         intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
 549         e.addnext(intro)
 550
 551         #0.5 adjust parameters to suit the particular kind of output
 552         if mode == 'web':
 553             self.maker.gutter = 0
 554
 555         #1. Save the html
 556         html_text = lxml.etree.tostring(self.tree, method="html")
 557         self.save_data(self.body_html_file, html_text)
 558
 559         #2. Make a pdf of it (direct to to final pdf)
 560         self.maker.make_raw_pdf(self.body_html_file, self.pdf_file,
 561                                 engine=self.engine, outline=True)
 562         self.notify_watcher('generate_pdf')
 563
 564         if mode != 'web':
 565             #3. resize pages and shift gutters.
 566             self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
 567             self.notify_watcher('reshape_pdf')
 568
 569             #4. add page numbers
 570             self.maker.number_pdf(self.pdf_file, None, dir=self.dir,
 571                                   numbers=self.page_numbers)
 572             self.notify_watcher("number_pdf")
 573         self.notify_watcher()
 574
 575
 576     def rotate180(self):
 577         """Rotate the pdf 180 degrees so an RTL book can print on LTR
 578         presses."""
 579         rotated = self.filepath('final-rotate.pdf')
 580         unrotated = self.filepath('final-pre-rotate.pdf')
 581         #leave the unrotated pdf intact at first, in case of error.
 582         rotate_pdf(self.pdf_file, rotated)
 583         os.rename(self.pdf_file, unrotated)
 584         os.rename(rotated, self.pdf_file)
 585         self.notify_watcher()
 586
 587     def publish_pdf(self):
 588         """Move the finished PDF to its final resting place"""
 589         log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
 590         os.rename(self.pdf_file, self.publish_file)
 591         self.notify_watcher()
 592
 593     def load_toc(self):
 594         """From the TOC.txt file create a list of TocItems with
 595         the attributes <status>, <chapter>, and <title>.
 596
 597         <status> is a number, with the following meaning:
 598
 599               0 - section heading with no chapter
 600               1 - chapter heading
 601               2 - book title
 602
 603         The TocItem object has convenience functions <is_chapter> and
 604         <is_section>.
 605
 606         <chapter> is twiki name of the chapter.
 607
 608         <title> is a human readable title for the chapter.  It is likely to
 609         differ from the title given in the chapter's <h1> heading.
 610         """
 611         f = urlopen(self.toc_url)
 612         self.toc = []
 613         while True:
 614             try:
 615                 self.toc.append(TocItem(f.next().strip(),
 616                                         f.next().strip(),
 617                                         f.next().strip()))
 618             except StopIteration:
 619                 break
 620         f.close()
 621         self.notify_watcher()
 622
 623     def load_book(self, tidy=True):
 624         """Fetch and parse the raw html of the book.  If tidy is true
 625         (default) links in the document will be made absolute."""
 626         f = urlopen(self.book_url)
 627         html = f.read()
 628         f.close()
 629         html = ('<html dir="%s"><head>\n<title>%s</title>\n'
 630                 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
 631                 '</head>\n<body>\n'
 632                 '%s\n'
 633                 '<div style="page-break-before: always; color:#fff;" class="unseen">'
 634                 'A FLOSSManuals book</div>\n</body></html>'
 635                 ) % (self.dir, self.book, html)
 636
 637         self.save_tempfile('raw.html', html)
 638
 639         tree = lxml.html.document_fromstring(html)
 640         if tidy:
 641             tree.make_links_absolute(self.book_url)
 642         self.tree = tree
 643         self.headings = [x for x in tree.cssselect('h1')]
 644         if self.headings:
 645             self.headings[0].set('class', "first-heading")
 646         #self.heading_texts = [x.textcontent() for x in self.headings]
 647         for h1 in self.headings:
 648             h1.title = h1.text_content().strip()
 649         self.notify_watcher()
 650
 651
 652     def load(self):
 653         """Wrapper around all necessary load methods."""
 654         self.load_book()
 655         self.load_toc()
 656
 657     def find_page(self, element, start_page=1):
 658         """Search through a page iterator and return the page
 659         number which the element probably occurs."""
 660         text = element.cookie
 661         for i, content in enumerate(self.text_pages[start_page - 1:]):
 662             log("looking for '%s' in page %s below:\n%s[...]" %
 663                 (text, i + start_page, content[:160]), debug='INDEX')
 664             #remove spaces: they can appear spuriously
 665             content = ''.join(content.split())
 666             if text in content:
 667                 return i + start_page, True
 668         #If it isn't found, return the start page so the next chapter has a chance
 669         return start_page, False
 670
 671     def make_contents(self):
 672         """Generate HTML containing the table of contents.  This can
 673         only be done after the main PDF has been made."""
 674         header = '<h1>Table of Contents</h1><table class="toc">\n'
 675         row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
 676                     '<td class="pagenumber">%s</td></tr>\n')
 677         section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
 678         footer = '\n</table>'
 679
 680         contents = []
 681
 682         chapter = 1
 683         page_num = 1
 684         subsections = [] # for the subsection heading pages.
 685
 686         headings = iter(self.headings)
 687
 688         for t in self.toc:
 689             if t.is_chapter():
 690                 try:
 691                     h1 = headings.next()
 692                 except StopIteration:
 693                     log("heading not found for %s (previous h1 missing?). Stopping" % t)
 694                     break
 695                 page_num, found = self.find_page(h1, page_num)
 696                 # sometimes the heading isn't found, which is shown as a frown
 697                 if found:
 698                     contents.append(row_tmpl % (chapter, h1.title, page_num))
 699                 else:
 700                     contents.append(row_tmpl % (chapter, h1.title, ':-('))
 701                 chapter += 1
 702             elif t.is_section():
 703                 contents.append(section_tmpl % t.title)
 704             else:
 705                 log("mystery TOC item: %s" % t)
 706
 707         doc = header + '\n'.join(contents) + footer
 708         self.notify_watcher()
 709         return doc
 710
 711     def add_section_titles(self):
 712         """Add any section heading pages that the TOC.txt file
 713         specifies.  These are sub-book, super-chapter groupings.
 714
 715         Also add initial numbers to chapters.
 716         """
 717         headings = iter(self.headings)
 718         chapter = 1
 719         section = None
 720
 721         for t in self.toc:
 722             if t.is_chapter() and section is not None:
 723                 try:
 724                     h1 = headings.next()
 725                 except StopIteration:
 726                     log("heading not found for %s (previous h1 missing?)" % t)
 727                     break
 728                 item = h1.makeelement('div', Class='chapter')
 729                 log(h1.title, debug='HTMLGEN')
 730                 item.text = h1.title
 731                 _add_initial_number(item, chapter)
 732
 733                 section.append(item)
 734
 735                 if not section_placed:
 736                     log("placing section", debug='HTMLGEN')
 737                     h1.addprevious(section)
 738                     section_placed = True
 739                 else:
 740                     log("NOT placing section", debug='HTMLGEN')
 741
 742                 #put a bold number at the beginning of the h1, and a hidden cookie at the end.
 743                 _add_initial_number(h1, chapter)
 744                 _add_chapter_cookie(h1)
 745                 chapter += 1
 746
 747             elif t.is_section():
 748                 section = self.tree.makeelement('div', Class="subsection")
 749                 # section Element complains when you try to ask it whether it
 750                 # has been placed (though it does know)
 751                 section_placed = False
 752                 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
 753                 heading.set("Class", "subsection-heading")
 754                 section.append(heading)
 755
 756         self.notify_watcher()
 757
 758
 759     def add_css(self, css=None, mode='book'):
 760         """If css looks like a url, use it as a stylesheet link.
 761         Otherwise it is the CSS itself, which is saved to a temporary file
 762         and linked to."""
 763         log("css is %r" % css)
 764         htmltree = self.tree
 765         if css is None or not css.strip():
 766             defaults = SERVER_DEFAULTS[self.server]
 767             url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
 768         elif not re.match(r'^http://\S+$', css):
 769             fn = self.save_tempfile('objavi.css', css)
 770             url = 'file://' + fn
 771         else:
 772             url = css
 773         #XXX for debugging and perhaps sensible anyway
 774         #url = url.replace('file:///home/douglas/objavi2', '')
 775
 776
 777         #find the head -- it's probably first child but lets not assume.
 778         for child in htmltree:
 779             if child.tag == 'head':
 780                 head = child
 781                 break
 782         else:
 783             head = htmltree.makeelement('head')
 784             htmltree.insert(0, head)
 785
 786         link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
 787         self.css_url = url
 788         self.notify_watcher()
 789         return url
 790
 791     def set_title(self, title=None):
 792         """If a string is supplied, it becomes the book's title.
 793         Otherwise a guess is made."""
 794         if title:
 795             self.title = title
 796         else:
 797             titles = [x.text_content() for x in self.tree.cssselect('title')]
 798             if titles and titles[0]:
 799                 self.title = titles[0]
 800             else:
 801                 #oh well
 802                 self.title = 'A Manual About ' + self.book
 803         return self.title
 804
 805     def _read_localised_template(self, template, fallbacks=['en']):
 806         """Try to get the template in the approriate language, otherwise in english."""
 807         for lang in [self.lang] + fallbacks:
 808             try:
 809                 fn = template % (lang)
 810                 f = open(fn)
 811                 break
 812             except IOError, e:
 813                 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
 814                 log(e)
 815         template = f.read()
 816         f.close()
 817         return template
 818
 819     def compose_inside_cover(self):
 820         """create the markup for the preamble inside cover."""
 821         template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
 822
 823         if self.isbn:
 824             isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
 825         else:
 826             isbn_text = ''
 827
 828         return template % {'date': time.strftime('%Y-%m-%d'),
 829                            'isbn': isbn_text,
 830                            'license': self.license,
 831                            }
 832
 833
 834     def compose_end_matter(self):
 835         """create the markup for the end_matter inside cover.  If
 836         self.isbn is not set, the html will result in a pdf that
 837         spills onto two pages.
 838         """
 839         template = self._read_localised_template(config.END_MATTER_TEMPLATE)
 840
 841         d = {'css_url': self.css_url,
 842              'title': self.title
 843              }
 844
 845         if self.isbn:
 846             d['inside_cover_style'] = ''
 847         else:
 848             d['inside_cover_style'] = 'page-break-after: always'
 849
 850         return template % d
 851
 852
 853
 854
 855     def spawn_x(self):
 856         """Start an Xvfb instance, using a new server number.  A
 857         reference to it is stored in self.xvfb, which is used to kill
 858         it when the pdf is done.
 859
 860         Note that Xvfb doesn't interact well with dbus which is
 861         present on modern desktops.
 862         """
 863         #Find an unused server number (in case two cgis are running at once)
 864         while True:
 865             servernum = random.randrange(50, 500)
 866             if not os.path.exists('/tmp/.X%s-lock' % servernum):
 867                 break
 868
 869         self.xserver_no = ':%s' % servernum
 870
 871         authfile = self.filepath('Xauthority')
 872         os.environ['XAUTHORITY'] = authfile
 873
 874         #mcookie(1) eats into /dev/random, so avoid that
 875         from hashlib import md5
 876         m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
 877         mcookie = m.hexdigest()
 878
 879         check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
 880
 881         self.xvfb = Popen(['Xvfb', self.xserver_no,
 882                            '-screen', '0', '1024x768x24',
 883                            '-pixdepths', '32',
 884                            #'-blackpixel', '0',
 885                            #'-whitepixel', str(2 ** 24 -1),
 886                            #'+extension', 'Composite',
 887                            '-dpi', '96',
 888                            '-kb',
 889                            '-nolisten', 'tcp',
 890                            ])
 891
 892         # We need to wait a bit before the Xvfb is ready.  but the
 893         # downloads are so slow that that probably doesn't matter
 894
 895         self.xvfb_ready_time = time.time() + 2
 896
 897         os.environ['DISPLAY'] = self.xserver_no
 898         log(self.xserver_no)
 899
 900     def wait_for_xvfb(self):
 901         """wait until a previously set time before continuing.  This
 902         is so Xvfb has time to properly start."""
 903         if hasattr(self, 'xvfb'):
 904             d = self.xvfb_ready_time - time.time()
 905             if d > 0:
 906                 time.sleep(d)
 907                 self.notify_watcher()
 908
 909     def cleanup_x(self):
 910         """Try very hard to kill off Xvfb.  In addition to killing
 911         this instance's xvfb, occasionally (randomly) search for
 912         escaped Xvfb instances and kill those too."""
 913         if not hasattr(self, 'xvfb'):
 914             return
 915         check_call(['xauth', 'remove', self.xserver_no])
 916         p = self.xvfb
 917         log("trying to kill Xvfb %s" % p.pid)
 918         os.kill(p.pid, 15)
 919         for i in range(10):
 920             if p.poll() is not None:
 921                 log("%s died with %s" % (p.pid, p.poll()))
 922                 break
 923             log("%s not dead yet" % p.pid)
 924             time.sleep(0.2)
 925         else:
 926             log("Xvfb would not die! kill -9! kill -9!")
 927             os.kill(p.pid, 9)
 928
 929         if random.random() < 0.05:
 930             #kill old xvfbs occasionally, if there are any.
 931             self.kill_old_xvfbs()
 932
 933     def kill_old_xvfbs(self):
 934         """Sometimes, despite everything, Xvfb instances hang around
 935         well after they are wanted -- for example if the cgi process
 936         dies particularly badly. So kill them if they have been
 937         running for a long time."""
 938         log("running kill_old_xvfbs")
 939         p = Popen(['ps', '-C' 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
 940         data = p.communicate()[0].strip()
 941         if data:
 942             lines = data.split('\n')
 943             for line in lines:
 944                 log('dealing with ps output "%s"' % line)
 945                 try:
 946                     pid, days_, hours, minutes, seconds = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$').groups()
 947                 except AttributeError:
 948                     log("Couldn't parse that line!")
 949                 # 50 minutes should be enough xvfb time for anyone
 950                 if days or hours or int(minutes) > 50:
 951                     log("going to kill pid %s" % pid)
 952                     os.kill(int(pid), 15)
 953                     time.sleep(0.5)
 954                     os.kill(int(pid), 9)
 955         self.notify_watcher()
 956
 957     def cleanup(self):
 958         self.cleanup_x()
 959         if not config.KEEP_TEMP_FILES:
 960             for fn in os.listdir(self.workdir):
 961                 os.remove(os.path.join(self.workdir, fn))
 962             os.rmdir(self.workdir)
 963         else:
 964             log("NOT removing '%s', containing the following files:" % self.workdir)
 965             log(*os.listdir(self.workdir))
 966
 967         self.notify_watcher()
 968
 969