extract_pdf_outline doesn't work on columned pages
[objavi2.git] / fmbook.py
blob184a4cd1e11cdc510d76c6829860cf9a52dbb2ad
1 # Part of Objavi2, which turns html manuals into books.
2 # This provides abstractions of texts and virtual printers and manages
3 # their interactions.
5 # Copyright (C) 2009 Douglas Bagnall
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License along
18 # with this program; if not, write to the Free Software Foundation, Inc.,
19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 """Library module representing a complete FM book being turned into a
22 PDF"""
24 import os, sys
25 import tempfile
26 import re, time
27 import random
28 from urllib2 import urlopen
29 from subprocess import Popen, check_call, PIPE
31 import lxml.etree, lxml.html
32 import lxml, lxml.html, lxml.etree
34 import config
35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
# Absolute path of the directory in which per-book working directories
# are created.
TMPDIR = os.path.abspath(config.TMPDIR)

# The web server's document root; the current directory is used when
# DOCUMENT_ROOT is not set in the environment.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')

# Directory that finished books are published into.
PUBLISH_PATH = DOC_ROOT + "/books/"
def log(*messages, **kwargs):
    """Write each message to stderr, one per line.

    If a <debug> keyword is specified, the messages are only printed
    when config.DEBUG_ALL is set or the keyword's value is in
    config.DEBUG_MODES.  Without the keyword they are always printed.
    """
    if 'debug' in kwargs:
        enabled = config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES
        if not enabled:
            return
    for message in messages:
        try:
            print >> sys.stderr, message
        except Exception:
            # the message itself would not print; its repr should.
            print >> sys.stderr, repr(message)
def _add_initial_number(e, n):
    """Insert "<strong Class="initial">n.</strong> " as the first child
    of element e, moving any leading text of e to follow the number."""
    leading_text = e.text
    number = e.makeelement("strong", Class="initial")
    e.insert(0, number)
    # The number is followed by a space and then whatever text used to
    # start the element.
    if leading_text is None:
        number.tail = ' '
    else:
        number.tail = ' ' + leading_text
    e.text = ''
    number.text = "%s." % n
class TocItem(object):
    """One (status, chapter, title) entry from a TOC.txt file.

    status is a string code:
      '0' - section heading with no chapter
      '1' - chapter heading
      '2' - book title

    chapter is the twiki name of the chapter.
    title is a human readable name of the chapter.
    """
    def __init__(self, status, chapter, title):
        self.title = title
        self.chapter = chapter
        self.status = status

    def is_chapter(self):
        """True if this entry is a chapter heading (status '1')."""
        return '1' == self.status

    def is_section(self):
        """True if this entry is a section heading (status '0')."""
        return '0' == self.status

    def __str__(self):
        # every instance attribute, as "name: value" pairs
        fields = ['%s: %s' % (name, value)
                  for name, value in self.__dict__.items()]
        return '<toc: %s>' % ', '.join(fields)
def run(cmd):
    """Run cmd (a list of program and arguments), wait for it to
    finish, and log its return code, stdout and stderr.

    Nothing is returned, and a non-zero exit status is only logged.
    If the command cannot be started or waited for, the failure is
    logged and the exception re-raised.
    """
    try:
        process = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()
    except Exception:
        log("Failed on command: %r" % cmd)
        raise
    report = (' '.join(cmd), cmd[0], process.poll(), stdout, stderr)
    log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" % report)
def find_containing_paper(w, h):
    """Find the first paper in config.PAPER_SIZES that is at least w
    wide and h high.

    Returns (name, mw, mh) where mw and mh are half of the spare width
    and height respectively.  Raises ValueError if no paper is big
    enough.
    """
    for name, pw, ph in config.PAPER_SIZES:
        if pw < w or ph < h:
            continue
        return (name, (pw - w) * 0.5, (ph - h) * 0.5)

    size_mm = (w * POINT_2_MM, h * POINT_2_MM)
    raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
                     size_mm)
class PageSettings(object):
    """Calculates and wraps commands for the generation and processing
    of PDFs.

    All measurements are in points unless otherwise stated.  User
    interaction is in *mm*, but is converted in objavi2.py.
    """
    def __init__(self, pointsize, **kwargs):
        """pointsize is a (width, height) pair.

        Recognised keyword arguments: top_margin, side_margin,
        bottom_margin, gutter, columns (a number, or 'auto'),
        column_margin and moz_printer.  The mere presence of a
        grey_scale keyword, whatever its value, turns greyscale on.

        Raises ValueError (from find_containing_paper) if the page
        fits on no known paper.
        """
        # the formulas for default gutters, margins and column margins
        # are quite ad-hoc and certainly improvable.
        self.width, self.height = pointsize
        self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
        self.grey_scale = 'grey_scale' in kwargs

        default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
        default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)

        self.top_margin = kwargs.get('top_margin', default_margin)
        self.side_margin = kwargs.get('side_margin', default_margin)
        self.bottom_margin = kwargs.get('bottom_margin', default_margin)
        self.gutter = kwargs.get('gutter', default_gutter)

        self.columns = kwargs.get('columns', 1)
        if self.columns == 'auto': #default for newspapers is to work out columns
            # Never fewer than one column: a page narrower than
            # MIN_COLUMN_WIDTH would otherwise get zero columns and
            # divide by zero in make_raw_pdf.
            self.columns = max(1, int(self.width // config.MIN_COLUMN_WIDTH))

        self.column_margin = kwargs.get('column_margin',
                                        default_margin * 2 / (5.0 + self.columns))

        self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
        self.number_margin = self.side_margin

        # calculate margins in mm for browsers, in the order
        # top, right, bottom, left.
        self.margins = []
        for m, clip in ((self.top_margin, clipy),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        ):
            self.margins.append((m + clip) * POINT_2_MM)

        self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))

        if 'PDFGEN' in config.DEBUG_MODES:
            log("making PageSettings with:")
            for x in locals().items():
                log("%s: %s" % x, debug='PDFGEN')
            for x in dir(self):
                if not x.startswith('__'):
                    log("self.%s: %s" % (x, getattr(self, x)), debug='PDFGEN')

    def _webkit_command(self, html, pdf, outline=False):
        """Return the wkhtmltopdf argument list that turns html into pdf."""
        m = [str(x) for x in self.margins]
        cmd = [config.WKHTMLTOPDF, '-q', '-s', self.papersize,
               '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
               '-d', '100']
        if outline:
            cmd.append('--outline')
        if self.grey_scale:
            cmd.append('-g')
        cmd = cmd + config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf]
        log(' '.join(cmd))
        return cmd

    def _gecko_command(self, html, pdf, outline=False):
        """Return the firefox argument list that prints html.

        pdf and outline are accepted for symmetry with _webkit_command
        but are not used in the command.
        """
        #firefox -P pdfprint -print URL -printprinter "printer_settings"
        cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
               html, '-printprinter', self.moz_printer]
        log(' '.join(cmd))
        return cmd

    def make_raw_pdf(self, html, pdf, engine='webkit', outline=False):
        """Render the html file as pdf using the named engine
        ('webkit' or 'gecko').

        With more than one column, the html is first rendered as a PDF
        of single-column pages (saved beside pdf with a
        '-single-column.pdf' suffix), which pdfnup then tiles side by
        side onto full pages.
        """
        func = getattr(self, '_%s_command' % engine)
        if self.columns == 1:
            cmd = func(html, pdf, outline=outline)
            run(cmd)
            return

        printable_width = self.width - 2.0 * self.side_margin - self.gutter
        column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
        page_width = column_width + self.column_margin
        side_margin = self.column_margin * 0.5
        if 'PDFGEN' in config.DEBUG_MODES:
            log("making columns with:")
            for k, v in locals().items():
                log("%s: %r" % (k, v))
            for k in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
                log("self.%s: %r" % (k, getattr(self, k)))

        column_settings = dict(moz_printer=self.moz_printer,
                               gutter=0, top_margin=self.top_margin,
                               side_margin=side_margin,
                               bottom_margin=self.bottom_margin)
        # __init__ switches greyscale on whenever the keyword is
        # present, so it must be left out entirely for colour output.
        if self.grey_scale:
            column_settings['grey_scale'] = True
        columnmaker = PageSettings((page_width, self.height), **column_settings)

        column_pdf = pdf[:-4] + '-single-column.pdf'
        columnmaker.make_raw_pdf(html, column_pdf, engine=engine, outline=outline)
        columnmaker.reshape_pdf(column_pdf)

        cmd = ['pdfnup',
               '--nup', '%sx1' % int(self.columns),
               '--paper', self.papersize.lower() + 'paper',
               '--outfile', pdf,
               '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
               '--noautoscale', 'true',
               '--orient', 'portrait',
               #'--tidy', 'false',
               column_pdf
               ]
        run(cmd)

    def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
                    even_pages=True):
        """Spin the pdf for RTL text, resize it to the right size, and
        shift the gutter left and right.  The pdf is modified in place
        by the wk_objavi.qs pdfedit script."""
        ops = ['resize']
        if self.gutter:
            ops.append('shift')
        if even_pages:
            ops.append('even_pages')
        gutter = self.gutter
        if dir == 'RTL':
            gutter = -gutter
        cmd = ['pdfedit', '-s', 'wk_objavi.qs',
               'dir=%s' % dir,
               'filename=%s' % pdf,
               'output_filename=%s' % pdf,
               'operation=%s' % ','.join(ops),
               'width=%s' % self.width,
               'height=%s' % self.height,
               'offset=%s' % gutter,
               'centre_start=%s' % centre_start,
               'centre_end=%s' % centre_end,
               ]
        run(cmd)

    def _number_pdf(self, pdf, numbers='latin', dir='LTR',
                    number_start=1):
        """Add page numbers to the pdf, in place, with one run of the
        wk_objavi.qs pdfedit script."""
        cmd = ['pdfedit', '-s', 'wk_objavi.qs',
               'operation=page_numbers',
               'dir=%s' % dir,
               'filename=%s' % pdf,
               'output_filename=%s' % pdf,
               'number_start=%s' % number_start,
               'number_style=%s' % numbers,
               'number_bottom=%s' % self.number_bottom,
               'number_margin=%s' % self.number_margin,
               ]
        run(cmd)

    def number_pdf(self, pdf, pages, **kwargs):
        """Add page numbers to the pdf.  Keyword arguments are passed
        on to _number_pdf.

        <pages> is the page count, or None if unknown; it need not be
        exact.
        """
        # if there are too many pages for pdfedit to handle in one go,
        # split the job into bits.  <pages> may not be exact
        if pages is None or pages <= PDFEDIT_MAX_PAGES:
            self._number_pdf(pdf, **kwargs)
            return

        # section_size must be even
        sections = pages // PDFEDIT_MAX_PAGES + 1
        section_size = (pages // sections + 2) & ~1

        pdf_sections = []
        # NOTE(review): s serves both as the first page of the pdftk
        # range and as the number printed on it, so splitting looks
        # correct only when number_start is 1 -- confirm with callers.
        s = kwargs.pop('number_start', 1)
        while s < pages:
            e = s + section_size - 1
            pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
            if e < pages - 1:
                page_range = '%s-%s' % (s, e)
            else:
                page_range = '%s-end' % s
            run(['pdftk',
                 pdf,
                 'cat',
                 page_range,
                 'output',
                 pdf_section,
                 ])
            self._number_pdf(pdf_section, number_start=s, **kwargs)
            pdf_sections.append(pdf_section)
            s = e + 1

        concat_pdfs(pdf, *pdf_sections)

    def make_barcode_pdf(self, isbn, pdf, corner='br'):
        """Put an ISBN barcode in a corner of a single blank page."""
        position = '%s,%s,%s,%s,%s' % (corner, self.width, self.height,
                                       self.side_margin, self.bottom_margin)
        cmd1 = [config.BOOKLAND,
                '--position', position,
                str(isbn)]
        cmd2 = ['ps2pdf',
                '-dFIXEDMEDIA',
                '-dDEVICEWIDTHPOINTS=%s' % self.width,
                '-dDEVICEHEIGHTPOINTS=%s' % self.height,
                '-', pdf]

        p1 = Popen(cmd1, stdout=PIPE)
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
        # Drop our copy of the pipe so that p1 is signalled if p2 exits
        # early, rather than blocking forever on a full pipe.
        p1.stdout.close()
        out, err = p2.communicate()
        # reap p1 so that there is a return code to log
        p1.wait()

        log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
        log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
def count_pdf_pages(pdf):
    """How many pages in the PDF?

    Runs pdfinfo on the file and returns the number from its "Pages:"
    line as an int.  Raises ValueError if pdfinfo printed no such line
    (for example because the file is missing or is not a PDF).
    """
    #XXX could also use python-pypdf or python-poppler
    cmd = ('pdfinfo', pdf)
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    if not isinstance(out, str):
        # the pipe yields bytes where str is a unicode type
        out = out.decode('utf-8', 'replace')
    # The text to search must be passed explicitly; the flags are the
    # third argument.
    m = re.search(r'^\s*Pages:\s*(\d+)\s*$', out, re.MULTILINE)
    if m is None:
        raise ValueError("pdfinfo found no page count in %r (stderr: %r)" %
                         (pdf, err))
    return int(m.group(1))
def concat_pdfs(destination, *pdfs):
    """Join all the named pdfs together, in order, with pdftk and save
    the result as <destination>.  Any pdf given as None is skipped."""
    sources = [x for x in pdfs if x is not None]
    run(['pdftk'] + sources + ['cat', 'output', destination])
def index_pdf(pdf, text=None):
    """Use pdftotext to extract utf-8 text from a pdf, using ^L to
    separate pages.

    The text is written to the file named by <text>, which defaults to
    the pdf's name with '.index.txt' appended.  Returns that filename.
    """
    if text is None:
        text = pdf + '.index.txt'
    #'-layout' would keep more of the original formatting
    run(['pdftotext', pdf, text])
    return text
def rotate_pdf(pdfin, pdfout):
    """Turn the PDF on its head: pdftk writes every page of <pdfin> to
    <pdfout> using the page range '1-endD'."""
    pdftk_args = ['cat', '1-endD', 'output', pdfout]
    run(['pdftk', pdfin] + pdftk_args)
def parse_outline(pdf, level_threshold):
    """Create a structure reflecting the outline of a PDF.

    The output of "pdftk <pdf> dump_data" is read as "Key: value"
    lines.  A chapter heading looks like this:

    BookmarkTitle: 2. What is sound?
    BookmarkLevel: 1
    BookmarkPageNumber: 3

    Returns (contents, outline, page_count) where contents is a list
    of (title, level, pagenum) tuples for the bookmarks whose level is
    no greater than <level_threshold>, outline is the raw pdftk
    output, and page_count is the NumberOfPages value, or None if
    pdftk did not report one.
    """
    cmd = ('pdftk', pdf, 'dump_data')
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    outline, err = p.communicate()
    if not isinstance(outline, str):
        # the pipe yields bytes where str is a unicode type
        outline = outline.decode('utf-8', 'replace')

    def to_int(text, line):
        try:
            return int(text)
        except ValueError:
            log("trouble with line %r" % line)
            return None

    page_count = None
    contents = []
    # title and level of the bookmark currently being read, if any
    title = level = None

    for raw_line in outline.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        key, colon, value = line.partition(':')
        if not colon:
            log("trouble with line %r" % line)
            # a malformed line spoils whatever bookmark it interrupts
            title = level = None
            continue
        value = value.strip()

        if key == 'NumberOfPages':
            if page_count is None:
                page_count = to_int(value, line)
        elif page_count is None:
            #There are a few useless variables before the pagecount;
            #nothing ahead of it is part of the contents.
            continue
        elif key == 'BookmarkTitle':
            title, level = value, None
        elif key == 'BookmarkLevel' and title is not None:
            level = to_int(value, line)
        elif key == 'BookmarkPageNumber' and title is not None:
            pagenum = to_int(value, line)
            # check for None before comparing with the threshold
            if None not in (level, pagenum) and level <= level_threshold:
                contents.append((title, level, pagenum))
            title = level = None

    return contents, outline, page_count
class Book(object):
    """A single book being fetched from a server and turned into a PDF.

    The parsed HTML (self.tree) and the table of contents (self.toc)
    are fetched on first access; see __getattr__.  Intermediate files
    live in self.workdir, which cleanup() removes unless
    config.KEEP_TEMP_FILES is set.  A Book can be used as a context
    manager, in which case cleanup() is called on exit.
    """
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'
    engine = 'webkit'
    _try_cleanup_on_del = config.TRY_BOOK_CLEANUP_ON_DEL

    def notify_watcher(self, message=None):
        """Log the message and pass it to the watcher callback, if
        there is one.  With no message, the name of the calling
        function is used."""
        if self.watcher:
            if message is None:
                #message is the name of the caller
                #XXX look at using inspect module
                import traceback
                message = traceback.extract_stack(None, 2)[0][2]
            log("notify_watcher called with '%s'" % message)
            self.watcher(message)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cleanup()
        #could deal with exceptions here and return true

    def __init__(self, book, server, bookname,
                 page_settings=None, engine=None, watcher=None, isbn=None,
                 license=config.DEFAULT_LICENSE):
        """Set up file names, URLs and page settings for the book.

        <server> must be a key of SERVER_DEFAULTS.  <page_settings> is
        a dictionary of PageSettings arguments and is required in
        practice, despite its default.  <watcher>, if given, is called
        with a short message as each step completes.
        """
        log("*** Starting new book %s ***" % bookname)
        self.book = book
        self.server = server
        self.watcher = watcher
        self.isbn = isbn
        self.license = license
        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
        # mode rwxr-xr-x (octal 755), spelt so that it parses under
        # any Python version.
        os.chmod(self.workdir, int('755', 8))
        defaults = SERVER_DEFAULTS[server]
        self.lang = defaults['lang']
        self.dir = defaults['dir']

        self.body_html_file = self.filepath('body.html')
        self.body_pdf_file = self.filepath('body.pdf')
        self.body_index_file = self.filepath('body.txt')
        self.preamble_html_file = self.filepath('preamble.html')
        self.preamble_pdf_file = self.filepath('preamble.pdf')
        self.tail_html_file = self.filepath('tail.html')
        self.tail_pdf_file = self.filepath('tail.pdf')
        self.isbn_pdf_file = None
        self.pdf_file = self.filepath('final.pdf')

        self.publish_name = bookname
        self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
        self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)

        self.book_url = config.BOOK_URL % (self.server, self.book)
        self.toc_url = config.TOC_URL % (self.server, self.book)

        self.maker = PageSettings(**page_settings)

        if engine is not None:
            self.engine = engine
        self.notify_watcher()

    if config.TRY_BOOK_CLEANUP_ON_DEL:
        #Dont even define __del__ if it is not used.
        _try_cleanup_on_del = True
        def __del__(self):
            if self._try_cleanup_on_del and os.path.exists(self.workdir):
                self._try_cleanup_on_del = False #or else you can get in bad cycles
                self.cleanup()

    def __getattr__(self, attr):
        """catch unloaded books and load them"""
        #log('looking for missing attribute "%s"' % (attr))
        if attr == 'tree':
            self.load_book()
            return self.tree
        if attr == 'toc':
            self.load_toc()
            return self.toc
        raise AttributeError("no such member: '%s'" % attr)

    def filepath(self, fn):
        """Return the path of the file named fn in the working directory."""
        return os.path.join(self.workdir, fn)

    def save_data(self, fn, data):
        """Save without tripping up on unicode"""
        if isinstance(data, unicode):
            data = data.encode('utf8', 'ignore')
        f = open(fn, 'w')
        try:
            f.write(data)
        finally:
            # close the file even if the write fails
            f.close()

    def save_tempfile(self, fn, data):
        """Save the data in a temporary directory that will be cleaned
        up when all is done.  Return the absolute file path."""
        fn = self.filepath(fn)
        self.save_data(fn, data)
        return fn

    def extract_pdf_outline(self):
        """Read the top-level outline of the body PDF into
        self.outline_contents (and the raw pdftk dump into
        self.outline_text), log each entry, and return the page count
        that pdftk reported."""
        self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
        for x in self.outline_contents:
            log(x)
        return number_of_pages

    def make_body_pdf(self):
        """Make a pdf of the HTML, using webkit"""
        #1. Save the html
        html_text = lxml.etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it
        self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file,
                                engine=self.engine, outline=True)
        self.notify_watcher('generate_pdf')

        #3. read the outline, which also gives the page count
        n_pages = self.extract_pdf_outline()

        log ("found %s pages in pdf" % n_pages)
        #4. resize pages, shift gutters, even pages
        self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #5 add page numbers
        self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                              numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
        self.notify_watcher()

    def make_preamble_pdf(self):
        """Make the PDF of the title page, inside cover and table of
        contents.  The body PDF must already have been made, because
        the contents are built from its outline."""
        contents = self.make_contents()
        inside_cover_html = self.compose_inside_cover()
        html = ('<html dir="%s"><head>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '<link rel="stylesheet" href="%s" />\n'
                '</head>\n<body>\n'
                '<h1 class="frontpage">%s</h1>'
                '%s\n'
                '<div class="contents">%s</div>\n'
                '<div style="page-break-after: always; color:#fff" class="unseen">.'
                '<!--%s--></div></body></html>'
                ) % (self.dir, self.css_url, self.title, inside_cover_html,
                     contents, self.title)
        self.save_data(self.preamble_html_file, html)

        self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file,
                                engine=self.engine)

        self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

        self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                              numbers=self.preamble_page_numbers,
                              number_start=-2)

        self.notify_watcher()

    def make_end_matter_pdf(self):
        """Make an inside back cover and a back cover.  If there is an
        isbn number its barcode will be put on the back cover."""
        if self.isbn:
            self.isbn_pdf_file = self.filepath('isbn.pdf')
            self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
            self.notify_watcher('make_barcode_pdf')

        self.save_data(self.tail_html_file, self.compose_end_matter())
        self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file,
                                engine=self.engine)

        self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                               centre_end=True, even_pages=False)
        self.notify_watcher()

    def make_book_pdf(self):
        """A convenient wrapper of a few necessary steps"""
        # now the Xvfb server is needed. make sure it has had long enough to get going
        self.wait_for_xvfb()
        self.make_body_pdf()
        self.make_preamble_pdf()
        self.make_end_matter_pdf()

        concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                    self.body_pdf_file, self.tail_pdf_file,
                    self.isbn_pdf_file)

        self.notify_watcher('concatenated_pdfs')

    def make_simple_pdf(self, mode):
        """Make a simple pdf document without contents or separate
        title page.  This is used for multicolumn newspapers and for
        web-destined pdfs."""
        self.wait_for_xvfb()
        #0. Add heading to begining of html
        body = list(self.tree.cssselect('body'))[0]
        e = body.makeelement('h1', {'id': 'book-title'})
        e.text = self.title
        body.insert(0, e)
        intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
        e.addnext(intro)

        #0.5 adjust parameters to suit the particular kind of output
        if mode == 'web':
            self.maker.gutter = 0

        #1. Save the html
        html_text = lxml.etree.tostring(self.tree, method="html")
        self.save_data(self.body_html_file, html_text)

        #2. Make a pdf of it (direct to to final pdf)
        self.maker.make_raw_pdf(self.body_html_file, self.pdf_file,
                                engine=self.engine, outline=True)
        self.notify_watcher('generate_pdf')
        #n_pages = self.extract_pdf_outline()
        n_pages = count_pdf_pages(self.pdf_file)

        if mode != 'web':
            #3. resize pages and shift gutters.
            self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
            self.notify_watcher('reshape_pdf')

            #4. add page numbers
            self.maker.number_pdf(self.pdf_file, n_pages,
                                  dir=self.dir, numbers=self.page_numbers)
            self.notify_watcher("number_pdf")
        self.notify_watcher()

    def rotate180(self):
        """Rotate the pdf 180 degrees so an RTL book can print on LTR
        presses."""
        rotated = self.filepath('final-rotate.pdf')
        unrotated = self.filepath('final-pre-rotate.pdf')
        #leave the unrotated pdf intact at first, in case of error.
        rotate_pdf(self.pdf_file, rotated)
        os.rename(self.pdf_file, unrotated)
        os.rename(rotated, self.pdf_file)
        self.notify_watcher()

    def publish_pdf(self):
        """Move the finished PDF to its final resting place"""
        log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
        os.rename(self.pdf_file, self.publish_file)
        self.notify_watcher()

    def load_toc(self):
        """From the TOC.txt file create a list of TocItems with
        the attributes <status>, <chapter>, and <title>.

        <status> is a number, with the following meaning:

              0 - section heading with no chapter
              1 - chapter heading
              2 - book title

        The TocItem object has convenience functions <is_chapter> and
        <is_section>.

        <chapter> is twiki name of the chapter.

        <title> is a human readable title for the chapter.  It is likely to
        differ from the title given in the chapter's <h1> heading.
        """
        f = urlopen(self.toc_url)
        self.toc = []
        try:
            while True:
                # Each item is three consecutive lines.  An incomplete
                # item at the end of the file is dropped.
                try:
                    self.toc.append(TocItem(f.next().strip(),
                                            f.next().strip(),
                                            f.next().strip()))
                except StopIteration:
                    break
        finally:
            f.close()
        self.notify_watcher()

    def load_book(self, tidy=True):
        """Fetch and parse the raw html of the book.  If tidy is true
        (default) links in the document will be made absolute."""
        f = urlopen(self.book_url)
        try:
            html = f.read()
        finally:
            f.close()
        html = ('<html dir="%s"><head>\n<title>%s</title>\n'
                '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
                '</head>\n<body>\n'
                '%s\n'
                '<div style="page-break-before: always; color:#fff;" class="unseen">'
                'A FLOSSManuals book</div>\n</body></html>'
                ) % (self.dir, self.book, html)

        self.save_tempfile('raw.html', html)

        tree = lxml.html.document_fromstring(html)
        if tidy:
            tree.make_links_absolute(self.book_url)
        self.tree = tree
        self.headings = [x for x in tree.cssselect('h1')]
        if self.headings:
            self.headings[0].set('class', "first-heading")
        for h1 in self.headings:
            h1.title = h1.text_content().strip()
        self.notify_watcher()

    def load(self):
        """Wrapper around all necessary load methods."""
        self.load_book()
        self.load_toc()

    def make_contents(self):
        """Generate HTML containing the table of contents.  This can
        only be done after the main PDF has been made.

        Chapters in the TOC are paired, in order, with the <h1>
        headings of the book and with the entries of the PDF outline.
        If the headings run out the table stops there; if the outline
        runs out the remaining chapters are listed without page
        numbers.
        """
        header = '<h1>Table of Contents</h1><table class="toc">\n'
        row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                    '<td class="pagenumber">%s</td></tr>\n')
        section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
        footer = '\n</table>'

        contents = []
        chapter = 1

        outline_contents = iter(self.outline_contents)
        headings = iter(self.headings)

        for t in self.toc:
            if t.is_chapter():
                try:
                    h1 = headings.next()
                except StopIteration:
                    log("heading not found for %s (previous h1 missing?). Stopping" % t)
                    break
                try:
                    h1_text, level, page_num = outline_contents.next()
                except StopIteration:
                    # A short (or empty) outline must not abort the
                    # whole book with a stray StopIteration.
                    log("no PDF outline entry for %s; leaving its page number blank" % t)
                    h1_text, page_num = None, ''
                log("%r %r" % (h1.title, h1_text))
                contents.append(row_tmpl % (chapter, h1.title, page_num))
                chapter += 1
            elif t.is_section():
                contents.append(section_tmpl % t.title)
            else:
                log("mystery TOC item: %s" % t)

        doc = header + '\n'.join(contents) + footer
        self.notify_watcher()
        return doc

    def add_section_titles(self):
        """Add any section heading pages that the TOC.txt file
        specifies.  These are sub-book, super-chapter groupings.

        Also add initial numbers to chapters.
        """
        headings = iter(self.headings)
        chapter = 1
        section = None
        section_placed = False

        # NOTE(review): chapters listed before the first section are
        # skipped without consuming a heading, unlike in make_contents
        # -- confirm that TOCs always begin with a section.
        for t in self.toc:
            if t.is_chapter() and section is not None:
                try:
                    h1 = headings.next()
                except StopIteration:
                    log("heading not found for %s (previous h1 missing?)" % t)
                    break
                item = h1.makeelement('div', Class='chapter')
                log(h1.title, debug='HTMLGEN')
                item.text = h1.title
                _add_initial_number(item, chapter)

                section.append(item)

                if not section_placed:
                    log("placing section", debug='HTMLGEN')
                    h1.addprevious(section)
                    section_placed = True
                else:
                    log("NOT placing section", debug='HTMLGEN')

                #put a bold number at the beginning of the h1.
                _add_initial_number(h1, chapter)
                chapter += 1

            elif t.is_section():
                section = self.tree.makeelement('div', Class="subsection")
                # section Element complains when you try to ask it whether it
                # has been placed (though it does know)
                section_placed = False
                heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
                heading.set("Class", "subsection-heading")
                section.append(heading)

        self.notify_watcher()

    def add_css(self, css=None, mode='book'):
        """If css looks like a url, use it as a stylesheet link.
        Otherwise it is the CSS itself, which is saved to a temporary file
        and linked to.

        With no css (None or blank), the server's default stylesheet
        for the mode is used.  Returns the stylesheet url, which is
        also stored as self.css_url.
        """
        log("css is %r" % css)
        htmltree = self.tree
        if css is None or not css.strip():
            defaults = SERVER_DEFAULTS[self.server]
            url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
        elif not re.match(r'^https?://\S+$', css):
            # not a url (http or https), so it must be the CSS itself
            fn = self.save_tempfile('objavi.css', css)
            url = 'file://' + fn
        else:
            url = css
        #XXX for debugging and perhaps sensible anyway
        #url = url.replace('file:///home/douglas/objavi2', '')

        #find the head -- it's probably first child but lets not assume.
        for child in htmltree:
            if child.tag == 'head':
                head = child
                break
        else:
            head = htmltree.makeelement('head')
            htmltree.insert(0, head)

        lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
        self.css_url = url
        self.notify_watcher()
        return url

    def set_title(self, title=None):
        """If a string is supplied, it becomes the book's title.
        Otherwise a guess is made."""
        if title:
            self.title = title
        else:
            titles = [x.text_content() for x in self.tree.cssselect('title')]
            if titles and titles[0]:
                self.title = titles[0]
            else:
                #oh well
                self.title = 'A Manual About ' + self.book
        return self.title

    def _read_localised_template(self, template, fallbacks=('en',)):
        """Try to get the template in the approriate language, otherwise in english.

        <template> is a filename pattern with one %s for the language
        code.  The book's language is tried first, then each of
        <fallbacks> in turn, and the contents of the first file that
        opens are returned.  Raises IOError if none can be opened.
        """
        for lang in [self.lang] + list(fallbacks):
            fn = template % (lang)
            try:
                f = open(fn)
            except IOError:
                e = sys.exc_info()[1]
                log("couldn't open template for lang %s (filename %s)" % (lang, fn))
                log(e)
                continue
            try:
                return f.read()
            finally:
                f.close()
        raise IOError("no readable template matching %r for languages %s" %
                      (template, [self.lang] + list(fallbacks)))

    def compose_inside_cover(self):
        """create the markup for the preamble inside cover."""
        template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

        if self.isbn:
            isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
        else:
            isbn_text = ''

        return template % {'date': time.strftime('%Y-%m-%d'),
                           'isbn': isbn_text,
                           'license': self.license,
                           }

    def compose_end_matter(self):
        """create the markup for the end_matter inside cover.  If
        self.isbn is not set, the html will result in a pdf that
        spills onto two pages.
        """
        template = self._read_localised_template(config.END_MATTER_TEMPLATE)

        d = {'css_url': self.css_url,
             'title': self.title
             }

        if self.isbn:
            d['inside_cover_style'] = ''
        else:
            d['inside_cover_style'] = 'page-break-after: always'

        return template % d

    def spawn_x(self):
        """Start an Xvfb instance, using a new server number.  A
        reference to it is stored in self.xvfb, which is used to kill
        it when the pdf is done.

        Note that Xvfb doesn't interact well with dbus which is
        present on modern desktops.
        """
        #Find an unused server number (in case two cgis are running at once)
        while True:
            servernum = random.randrange(50, 500)
            if not os.path.exists('/tmp/.X%s-lock' % servernum):
                break

        self.xserver_no = ':%s' % servernum

        authfile = self.filepath('Xauthority')
        os.environ['XAUTHORITY'] = authfile

        #mcookie(1) eats into /dev/random, so avoid that
        from hashlib import md5
        m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
        mcookie = m.hexdigest()

        check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

        self.xvfb = Popen(['Xvfb', self.xserver_no,
                           '-screen', '0', '1024x768x24',
                           '-pixdepths', '32',
                           #'-blackpixel', '0',
                           #'-whitepixel', str(2 ** 24 -1),
                           #'+extension', 'Composite',
                           '-dpi', '96',
                           '-kb',
                           '-nolisten', 'tcp',
                           ])

        # We need to wait a bit before the Xvfb is ready.  but the
        # downloads are so slow that that probably doesn't matter

        self.xvfb_ready_time = time.time() + 2

        os.environ['DISPLAY'] = self.xserver_no
        log(self.xserver_no)

    def wait_for_xvfb(self):
        """wait until a previously set time before continuing.  This
        is so Xvfb has time to properly start."""
        if hasattr(self, 'xvfb'):
            d = self.xvfb_ready_time - time.time()
            if d > 0:
                time.sleep(d)
                self.notify_watcher()

    def cleanup_x(self):
        """Try very hard to kill off Xvfb.  In addition to killing
        this instance's xvfb, occasionally (randomly) search for
        escaped Xvfb instances and kill those too."""
        if not hasattr(self, 'xvfb'):
            return
        check_call(['xauth', 'remove', self.xserver_no])
        p = self.xvfb
        log("trying to kill Xvfb %s" % p.pid)
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            log("Xvfb would not die! kill -9! kill -9!")
            os.kill(p.pid, 9)

        if random.random() < 0.05:
            #kill old xvfbs occasionally, if there are any.
            self.kill_old_xvfbs()

    def kill_old_xvfbs(self):
        """Sometimes, despite everything, Xvfb instances hang around
        well after they are wanted -- for example if the cgi process
        dies particularly badly.  So kill them if they have been
        running for a long time."""
        log("running kill_old_xvfbs")
        p = Popen(['ps', '-C', 'Xvfb', '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
        data = p.communicate()[0].strip()
        if data:
            # pid, then elapsed time as [[dd-]hh:]mm:ss.  ps pads the
            # pid column, so leading space is allowed.
            ps_line = re.compile(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$')
            for line in data.split('\n'):
                log('dealing with ps output "%s"' % line)
                m = ps_line.match(line)
                if m is None:
                    log("Couldn't parse that line!")
                    continue
                pid, days, hours, minutes, seconds = m.groups()
                # 50 minutes should be enough xvfb time for anyone
                if days or hours or int(minutes) > 50:
                    log("going to kill pid %s" % pid)
                    try:
                        os.kill(int(pid), 15)
                        time.sleep(0.5)
                        os.kill(int(pid), 9)
                    except OSError:
                        # It has most likely died already (or is not
                        # ours to kill); either way, carry on.
                        log("could not signal pid %s: %s" % (pid, sys.exc_info()[1]))
        self.notify_watcher()

    def cleanup(self):
        """Kill this book's Xvfb and, unless config.KEEP_TEMP_FILES is
        set, delete the working directory and the files in it."""
        self.cleanup_x()
        if not config.KEEP_TEMP_FILES:
            for fn in os.listdir(self.workdir):
                os.remove(os.path.join(self.workdir, fn))
            os.rmdir(self.workdir)
        else:
            log("NOT removing '%s', containing the following files:" % self.workdir)
            log(*os.listdir(self.workdir))

        self.notify_watcher()