debug to STDERR, not STDOUT
[objavi2.git] / fmbook.py
blob005b4a5ea32dcca40d6f7bd423e312fb2523d0e1
1 # Part of Objavi2, which turns html manuals into books.
2 # This provides abstractions of texts and virtual printers and manages
3 # their interactions.
5 # Copyright (C) 2009 Douglas Bagnall
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License along
18 # with this program; if not, write to the Free Software Foundation, Inc.,
19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 """Library module representing a complete FM book being turned into a
22 PDF"""
24 import os, sys
25 import tempfile
26 import re, time
27 import random
28 from urllib2 import urlopen
29 from subprocess import Popen, check_call, PIPE
31 import lxml.etree, lxml.html
32 import lxml, lxml.html, lxml.etree
34 import config
35 from config import SERVER_DEFAULTS, DEFAULT_SERVER, POINT_2_MM, PDFEDIT_MAX_PAGES
# Absolute path of the directory in which each Book creates its own
# temporary working directory (see Book.__init__).
TMPDIR = os.path.abspath(config.TMPDIR)
# The web server's document root, taken from the DOCUMENT_ROOT
# environment variable; falls back to the current directory when unset.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
# Directory (with trailing slash) that finished books are published into.
PUBLISH_PATH = "%s/books/" % DOC_ROOT
def log(*messages, **kwargs):
    """Write each message to stderr, one per line.

    Without a <debug> keyword the messages are always written.  With
    one, they are only written if config.DEBUG_ALL is set or the
    keyword's value is in config.DEBUG_MODES.  A message that cannot
    be printed directly is written as its repr() instead."""
    if 'debug' in kwargs:
        enabled = config.DEBUG_ALL or kwargs['debug'] in config.DEBUG_MODES
        if not enabled:
            return
    for message in messages:
        try:
            print >> sys.stderr, message
        except Exception:
            # e.g. unencodable unicode; fall back to a safe representation
            print >> sys.stderr, repr(message)
def _add_initial_number(e, n):
    """Prefix element e with a <strong Class="initial"> element holding
    the chapter number n followed by a full stop.

    Any text that e started with is moved into the tail of the new
    element (after a single space), so it still follows the number."""
    leading_text = e.text
    marker = e.makeelement("strong", Class="initial")
    e.insert(0, marker)
    marker.text = "%s." % n
    if leading_text is None:
        marker.tail = ' '
    else:
        marker.tail = ' ' + leading_text
    e.text = ''
class TocItem(object):
    """This makes sense of the tuples from TOC.txt files.

    Attributes:
      status  -- the kind of entry:
                   0 - section heading with no chapter
                   1 - chapter heading
                   2 - book title
                 TOC.txt supplies it as a string ('0', '1', '2'); an
                 integer is accepted as well.
      chapter -- twiki name of the chapter
      title   -- a human readable name of the chapter
    """
    def __init__(self, status, chapter, title):
        self.status = status
        self.chapter = chapter
        self.title = title

    def _status_is(self, code):
        # Compare via string formatting so that '1', u'1' and 1 are all
        # recognised as the same status.
        return '%s' % (self.status,) == code

    def is_chapter(self):
        """True if this item is a chapter heading (status 1)."""
        return self._status_is('1')

    def is_section(self):
        """True if this item is a section heading (status 0)."""
        return self._status_is('0')

    def __str__(self):
        return '<toc: %s>' % ', '.join('%s: %s' % x for x in self.__dict__.items())
def run(cmd):
    """Run the command <cmd> (a sequence of program name and arguments),
    wait for it to finish, and log its exit status, stdout and stderr.

    Returns the exit status of the command.  A non-zero status is
    logged but not treated as an error.  If the command cannot be
    started at all, the failure is logged and the exception re-raised."""
    try:
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
    except Exception:
        # (cmd,) so that a tuple command is formatted as one value
        # rather than being unpacked as several format arguments.
        log("Failed on command: %r" % (cmd,))
        raise
    status = p.poll()
    log("%s\n%s returned %s and produced\nstdout:%s\nstderr:%s" %
        (' '.join(cmd), cmd[0], status, out, err))
    return status
def find_containing_paper(w, h):
    """Find the first paper size in config.PAPER_SIZES that is at least
    <w> wide and <h> high (both in points).

    Returns a tuple (name, mw, mh) where mw and mh are half of the
    spare width and height, i.e. the margin on each side if the page
    is centred on the paper.  Raises ValueError if nothing is big
    enough.

    NOTE(review): taking the first match only gives the tightest fit
    if config.PAPER_SIZES is ordered smallest first -- confirm."""
    for name, pw, ph in config.PAPER_SIZES:
        if pw >= w and ph >= h:
            mw = (pw - w) * 0.5
            mh = (ph - h) * 0.5
            return (name, mw, mh)

    raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
                     (w * POINT_2_MM, h * POINT_2_MM))
class PageSettings(object):
    """Calculates and wraps commands for the generation and processing
    of PDFs"""
    def __init__(self, pointsize, **kwargs):
        """<pointsize> is a (width, height) pair in points.

        Optional keyword arguments (measurements in points):
          grey_scale    -- true to ask wkhtmltopdf for greyscale output
          engine        -- name used to pick the _<engine>_command method
          top_margin, side_margin, bottom_margin, gutter, column_margin
          columns       -- number of columns, or 'auto'
          moz_printer   -- printer name handed to firefox
        Anything not given gets a default derived from config and the
        page size."""
        # the formulas for default gutters, margins and column margins
        # are quite ad-hoc and certainly improvable.
        self.width, self.height = pointsize
        self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
        # Use the value, not the mere presence of the key: make_raw_pdf
        # always passes grey_scale on to its single-column helper, and an
        # explicit False must stay False.
        self.grey_scale = bool(kwargs.get('grey_scale'))

        self.engine = kwargs.get('engine', config.DEFAULT_ENGINE)
        # All measurements in points unless otherwise stated
        # user interaction is in *mm*, but is converted in objavi2.py
        default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
        default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)

        self.top_margin = kwargs.get('top_margin', default_margin)
        self.side_margin = kwargs.get('side_margin', default_margin)
        self.bottom_margin = kwargs.get('bottom_margin', default_margin)
        self.gutter = kwargs.get('gutter', default_gutter)

        self.columns = kwargs.get('columns', 1)
        if self.columns == 'auto': #default for newspapers is to work out columns
            # never fewer than one column, or make_raw_pdf would divide by zero
            self.columns = max(1, int(self.width // config.MIN_COLUMN_WIDTH))

        self.column_margin = kwargs.get('column_margin',
                                        default_margin * 2 / (5.0 + self.columns))

        self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
        self.number_margin = self.side_margin

        # calculate margins in mm for browsers
        # (order is top, right, bottom, left, as used by _webkit_command)
        self.margins = []
        for m, clip in ((self.top_margin, clipy),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
                        (self.side_margin, clipx + 0.5 * self.gutter),
                        ):
            self.margins.append((m + clip) * POINT_2_MM)

        self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))

        if 'PDFGEN' in config.DEBUG_MODES:
            log("making PageSettings with:")
            for x in locals().items():
                log("%s: %s" % x, debug='PDFGEN')
            for x in dir(self):
                if not x.startswith('__'):
                    log("self.%s: %s" % (x, getattr(self, x)), debug='PDFGEN')

    def _webkit_command(self, html, pdf, outline=False):
        """Return the wkhtmltopdf command line (as a list) that renders
        <html> to <pdf> with this object's paper size and margins."""
        m = [str(x) for x in self.margins]
        # list * bool gives either the list or an empty list
        outline_args = ['--outline'] * outline
        greyscale_args = ['-g'] * self.grey_scale
        cmd = ([config.WKHTMLTOPDF, '-q', '-s', self.papersize,
                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
                '-d', '100'] + outline_args + greyscale_args +
               config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf])
        log(' '.join(cmd))
        return cmd

    def _gecko_command(self, html, pdf, outline=False):
        """Return the firefox command line (as a list) that prints
        <html> on the printer named self.moz_printer.  <pdf> and
        <outline> are accepted for symmetry with _webkit_command but
        are not used in the command."""
        #firefox -P pdfprint -print URL -printprinter "printer_settings"
        cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
               html, '-printprinter', self.moz_printer]
        log(' '.join(cmd))
        return cmd

    def make_raw_pdf(self, html, pdf, outline=False):
        """Render <html> into <pdf> using the configured engine.

        For a single column the engine is run directly.  For several
        columns, a one-column pdf of narrow pages is made first (saved
        alongside <pdf> with a '-single-column.pdf' suffix) and pdfnup
        then lays self.columns of those pages side by side."""
        func = getattr(self, '_%s_command' % self.engine)
        if self.columns == 1:
            cmd = func(html, pdf, outline=outline)
            run(cmd)
        else:
            printable_width = self.width - 2.0 * self.side_margin - self.gutter
            column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
            page_width = column_width + self.column_margin
            side_margin = self.column_margin * 0.5
            if 'PDFGEN' in config.DEBUG_MODES:
                log("making columns with:")
                for k, v in locals().items():
                    log("%s: %r" % (k, v))
                for k in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
                    log("self.%s: %r" % (k, getattr(self, k)))

            columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
                                       gutter=0, top_margin=self.top_margin,
                                       side_margin=side_margin,
                                       bottom_margin=self.bottom_margin,
                                       grey_scale=self.grey_scale,
                                       engine=self.engine
                                       )

            column_pdf = pdf[:-4] + '-single-column.pdf'
            columnmaker.make_raw_pdf(html, column_pdf, outline=outline)
            columnmaker.reshape_pdf(column_pdf)
            cmd = ['pdfnup',
                   '--nup', '%sx1' % int(self.columns),
                   '--paper', self.papersize.lower() + 'paper',
                   '--outfile', pdf,
                   '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
                   '--noautoscale', 'true',
                   '--orient', 'portrait',
                   #'--tidy', 'false',
                   column_pdf
                   ]
            run(cmd)

    def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
                    even_pages=True):
        """Spin the pdf for RTL text, resize it to the right size, and
        shift the gutter left and right.

        The work is done in place by pdfedit's wk_objavi.qs script.
        The 'shift' operation is only requested when there is a gutter,
        and the gutter offset is negated for RTL books."""
        ops = 'resize'
        if self.gutter:
            ops += ',shift'
        if even_pages:
            ops += ',even_pages'
        gutter = self.gutter
        if dir == 'RTL':
            gutter = -gutter
        cmd = ['pdfedit', '-s', 'wk_objavi.qs',
               'dir=%s' % dir,
               'filename=%s' % pdf,
               'output_filename=%s' % pdf,
               'operation=%s' % ops,
               'width=%s' % self.width,
               'height=%s' % self.height,
               'offset=%s' % gutter,
               'centre_start=%s' % centre_start,
               'centre_end=%s' % centre_end,
               ]
        run(cmd)

    def _number_pdf(self, pdf, numbers='latin', dir='LTR',
                    number_start=1):
        """Add page numbers to <pdf> in place with a single pdfedit run."""
        cmd = ['pdfedit', '-s', 'wk_objavi.qs',
               'operation=page_numbers',
               'dir=%s' % dir,
               'filename=%s' % pdf,
               'output_filename=%s' % pdf,
               'number_start=%s' % number_start,
               'number_style=%s' % numbers,
               'number_bottom=%s' % self.number_bottom,
               'number_margin=%s' % self.number_margin,
               ]
        run(cmd)

    def number_pdf(self, pdf, pages, **kwargs):
        """Add page numbers to <pdf> in place.

        <pages> is the (possibly inexact) page count, or None if
        unknown.  Keyword arguments are passed on to _number_pdf.  If
        there are more than PDFEDIT_MAX_PAGES pages the pdf is split
        with pdftk into even-sized sections, each is numbered
        separately, and the sections are joined back together."""
        # if there are too many pages for pdfedit to handle in one go,
        # split the job into bits.  <pages> may not be exact
        if pages is None or pages <= PDFEDIT_MAX_PAGES:
            self._number_pdf(pdf, **kwargs)
        else:
            # section_size must be even
            sections = pages // PDFEDIT_MAX_PAGES + 1
            section_size = (pages // sections + 2) & ~1

            pdf_sections = []
            # <s> and <e> are physical page positions (pdftk counts from
            # 1); the printed number is offset from them by number_start.
            number_start = kwargs.pop('number_start', 1)
            s = 1
            while s < pages:
                e = s + section_size - 1
                pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
                if e < pages - 1:
                    page_range = '%s-%s' % (s, e)
                else:
                    # the page count may be off, so the last section
                    # takes whatever is left
                    page_range = '%s-end' % s
                run(['pdftk',
                     pdf,
                     'cat',
                     page_range,
                     'output',
                     pdf_section,
                     ])
                self._number_pdf(pdf_section, number_start=number_start + s - 1, **kwargs)
                pdf_sections.append(pdf_section)
                s = e + 1

            concat_pdfs(pdf, *pdf_sections)

    def make_barcode_pdf(self, isbn, pdf, corner='br'):
        """Put an ISBN barcode in a corner of a single blank page."""

        position = '%s,%s,%s,%s,%s' %(corner, self.width, self.height, self.side_margin, self.bottom_margin)
        cmd1 = [config.BOOKLAND,
                '--position', position,
                str(isbn)]
        cmd2 = ['ps2pdf',
                '-dFIXEDMEDIA',
                '-dDEVICEWIDTHPOINTS=%s' % self.width,
                '-dDEVICEHEIGHTPOINTS=%s' % self.height,
                '-', pdf]

        # equivalent to the shell pipeline: cmd1 | cmd2
        p1 = Popen(cmd1, stdout=PIPE)
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
        # Drop our copy of the pipe so that p1 gets SIGPIPE if p2 exits
        # early, and no file descriptor is leaked.
        p1.stdout.close()
        out, err = p2.communicate()
        # reap p1 so that its exit status is available for the log
        p1.wait()

        log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
        log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
def count_pdf_pages(pdf):
    """How many pages in the PDF?

    Runs pdfinfo on <pdf> and returns the number on its 'Pages:' line
    as an int.  Raises ValueError (after logging pdfinfo's output) if
    no such line is found, for example because pdfinfo could not read
    the file."""
    #XXX could also use python-pypdf or python-poppler
    cmd = ('pdfinfo', pdf)
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    m = re.search(r'^\s*Pages:\s*(\d+)\s*$', out, re.MULTILINE)
    if m is None:
        log("pdfinfo %s returned %s and produced\nstdout:%s\nstderr:%s" %
            (pdf, p.poll(), out, err))
        raise ValueError("couldn't find a page count for %r" % (pdf,))
    return int(m.group(1))
def concat_pdfs(destination, *pdfs):
    """Use pdftk to join the named pdfs, in the order given, into a
    single file saved as <destination>.  Any pdf given as None is
    left out."""
    sources = [x for x in pdfs if x is not None]
    run(['pdftk'] + sources + ['cat', 'output', destination])
def index_pdf(pdf, text=None):
    """Extract utf-8 text from a pdf with pdftotext, using ^L to
    separate pages.

    The text is written to <text>, or to <pdf> + '.index.txt' if no
    name is given.  Returns the name of the text file."""
    destination = text
    if destination is None:
        destination = pdf + '.index.txt'
    #'-layout' would keep more of the original formatting
    run(['pdftotext', pdf, destination])
    return destination
def rotate_pdf(pdfin, pdfout):
    """Turn the PDF on its head: every page of <pdfin> is passed
    through pdftk with the 'D' rotation suffix and the result is
    saved as <pdfout>."""
    every_page_upside_down = '1-endD'
    run(['pdftk', pdfin, 'cat', every_page_upside_down, 'output', pdfout])
def parse_outline(pdf, level_threshold):
    """Create a structure reflecting the outline of a PDF.
    A chapter heading looks like this:

    BookmarkTitle: 2. What is sound?
    BookmarkLevel: 1
    BookmarkPageNumber: 3

    Returns a tuple (contents, outline, page_count):
      contents   -- list of (title, level, pagenum) tuples for the
                    bookmarks whose level is <= level_threshold
      outline    -- the raw text of `pdftk <pdf> dump_data`
      page_count -- the NumberOfPages value, or None if pdftk's output
                    did not contain one
    """
    cmd = ('pdftk', pdf, 'dump_data')
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    outline, err = p.communicate()
    lines = (x.strip() for x in outline.split('\n') if x.strip())
    contents = []

    def extract(expected, conv=str.strip):
        """Consume one line; if its key is <expected> return its
        converted value, otherwise None.  Raises StopIteration when
        the lines run out."""
        line = lines.next()
        try:
            k, v = line.split(':', 1)
            if k == expected:
                return conv(v)
        except ValueError:
            # no colon in the line, or the value would not convert
            log("trouble with line %r" % (line,))
        return None

    #There are a few useless variables, then the pagecount, then the contents.
    #The pagecount is useful, so pick it up first.
    page_count = None
    try:
        # Inside the try so that output with no NumberOfPages line (for
        # example when pdftk fails) ends the scan instead of leaking
        # StopIteration to the caller.
        while page_count is None:
            page_count = extract('NumberOfPages', int)

        while True:
            title = extract('BookmarkTitle')
            if title is not None:
                level = extract('BookmarkLevel', int)
                pagenum = extract('BookmarkPageNumber', int)
                # check for None before comparing levels
                if None not in (level, pagenum) and level <= level_threshold:
                    contents.append((title, level, pagenum))
    except StopIteration:
        pass

    return contents, outline, page_count
400 class Book(object):
401 page_numbers = 'latin'
402 preamble_page_numbers = 'roman'
404 def notify_watcher(self, message=None):
405 if self.watcher:
406 if message is None:
407 #message is the name of the caller
408 #XXX look at using inspect module
409 import traceback
410 message = traceback.extract_stack(None, 2)[0][2]
411 log("notify_watcher called with '%s'" % message)
412 self.watcher(message)
414 def __enter__(self):
415 return self
417 def __exit__(self, exc_type, exc_value, traceback):
418 self.cleanup()
419 #could deal with exceptions here and return true
421 def __init__(self, book, server, bookname,
422 page_settings=None, watcher=None, isbn=None,
423 license=config.DEFAULT_LICENSE):
424 log("*** Starting new book %s ***" % bookname)
425 self.book = book
426 self.server = server
427 self.watcher = watcher
428 self.isbn = isbn
429 self.license = license
430 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
431 os.chmod(self.workdir, 0755)
432 defaults = SERVER_DEFAULTS[server]
433 self.lang = defaults['lang']
434 self.dir = defaults['dir']
436 self.body_html_file = self.filepath('body.html')
437 self.body_pdf_file = self.filepath('body.pdf')
438 self.body_index_file = self.filepath('body.txt')
439 self.preamble_html_file = self.filepath('preamble.html')
440 self.preamble_pdf_file = self.filepath('preamble.pdf')
441 self.tail_html_file = self.filepath('tail.html')
442 self.tail_pdf_file = self.filepath('tail.pdf')
443 self.isbn_pdf_file = None
444 self.pdf_file = self.filepath('final.pdf')
445 self.body_odt_file = self.filepath('body.odt')
447 self.publish_name = bookname
448 self.publish_file = os.path.join(PUBLISH_PATH, self.publish_name)
449 self.publish_url = os.path.join(config.PUBLISH_URL, self.publish_name)
451 self.book_url = config.BOOK_URL % (self.server, self.book)
452 self.toc_url = config.TOC_URL % (self.server, self.book)
453 if page_settings is not None:
454 self.maker = PageSettings(**page_settings)
456 self.notify_watcher()
458 if config.TRY_BOOK_CLEANUP_ON_DEL:
459 #Dont even define __del__ if it is not used.
460 _try_cleanup_on_del = True
461 def __del__(self):
462 if self._try_cleanup_on_del and os.path.exists(self.workdir):
463 self._try_cleanup_on_del = False #or else you can get in bad cycles
464 self.cleanup()
466 def __getattr__(self, attr):
467 """catch unloaded books and load them"""
468 #log('looking for missing attribute "%s"' % (attr))
469 if attr == 'tree':
470 self.load_book()
471 return self.tree
472 if attr == 'toc':
473 self.load_toc()
474 return self.toc
475 raise AttributeError("no such member: '%s'" % attr)
478 def filepath(self, fn):
479 return os.path.join(self.workdir, fn)
481 def save_data(self, fn, data):
482 """Save without tripping up on unicode"""
483 if isinstance(data, unicode):
484 data = data.encode('utf8', 'ignore')
485 f = open(fn, 'w')
486 f.write(data)
487 f.close()
489 def save_tempfile(self, fn, data):
490 """Save the data in a temporary directory that will be cleaned
491 up when all is done. Return the absolute file path."""
492 fn = self.filepath(fn)
493 self.save_data(fn, data)
494 return fn
496 def make_oo_doc(self):
497 """Make an openoffice document, using the html2odt script."""
498 self.wait_for_xvfb()
499 html_text = lxml.etree.tostring(self.tree, method="html")
500 self.save_data(self.body_html_file, html_text)
501 run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
502 log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
503 os.rename(self.body_odt_file, self.publish_file)
504 self.notify_watcher()
506 def extract_pdf_outline(self):
507 self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
508 for x in self.outline_contents:
509 log(x)
510 self.notify_watcher()
511 return number_of_pages
513 def make_body_pdf(self):
514 """Make a pdf of the HTML, using webkit"""
515 #1. Save the html
516 html_text = lxml.etree.tostring(self.tree, method="html")
517 self.save_data(self.body_html_file, html_text)
519 #2. Make a pdf of it
520 self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
521 self.notify_watcher('generate_pdf')
523 n_pages = self.extract_pdf_outline()
525 log ("found %s pages in pdf" % n_pages)
526 #4. resize pages, shift gutters, even pages
527 self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
528 self.notify_watcher('reshape_pdf')
530 #5 add page numbers
531 self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
532 numbers=self.page_numbers)
533 self.notify_watcher("number_pdf")
534 self.notify_watcher()
536 def make_preamble_pdf(self):
537 contents = self.make_contents()
538 inside_cover_html = self.compose_inside_cover()
539 html = ('<html dir="%s"><head>\n'
540 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
541 '<link rel="stylesheet" href="%s" />\n'
542 '</head>\n<body>\n'
543 '<h1 class="frontpage">%s</h1>'
544 '%s\n'
545 '<div class="contents">%s</div>\n'
546 '<div style="page-break-after: always; color:#fff" class="unseen">.'
547 '<!--%s--></div></body></html>'
548 ) % (self.dir, self.css_url, self.title, inside_cover_html,
549 contents, self.title)
550 self.save_data(self.preamble_html_file, html)
552 self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)
555 self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)
557 self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
558 numbers=self.preamble_page_numbers,
559 number_start=-2)
561 self.notify_watcher()
563 def make_end_matter_pdf(self):
564 """Make an inside back cover and a back cover. If there is an
565 isbn number its barcode will be put on the back cover."""
566 if self.isbn:
567 self.isbn_pdf_file = self.filepath('isbn.pdf')
568 self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
569 self.notify_watcher('make_barcode_pdf')
571 self.save_data(self.tail_html_file, self.compose_end_matter())
572 self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)
574 self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
575 centre_end=True, even_pages=False)
576 self.notify_watcher()
578 def make_book_pdf(self):
579 """A convenient wrapper of a few necessary steps"""
580 # now the Xvfb server is needed. make sure it has had long enough to get going
581 self.wait_for_xvfb()
582 self.make_body_pdf()
583 self.make_preamble_pdf()
584 self.make_end_matter_pdf()
586 concat_pdfs(self.pdf_file, self.preamble_pdf_file,
587 self.body_pdf_file, self.tail_pdf_file,
588 self.isbn_pdf_file)
590 self.notify_watcher('concatenated_pdfs')
593 def make_simple_pdf(self, mode):
594 """Make a simple pdf document without contents or separate
595 title page. This is used for multicolumn newspapers and for
596 web-destined pdfs."""
597 self.wait_for_xvfb()
598 #0. Add heading to begining of html
599 body = list(self.tree.cssselect('body'))[0]
600 e = body.makeelement('h1', {'id': 'book-title'})
601 e.text = self.title
602 body.insert(0, e)
603 intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
604 e.addnext(intro)
606 #0.5 adjust parameters to suit the particular kind of output
607 if mode == 'web':
608 self.maker.gutter = 0
610 #1. Save the html
611 html_text = lxml.etree.tostring(self.tree, method="html")
612 self.save_data(self.body_html_file, html_text)
614 #2. Make a pdf of it (direct to to final pdf)
615 self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
616 self.notify_watcher('generate_pdf')
617 #n_pages = self.extract_pdf_outline()
618 n_pages = count_pdf_pages(self.pdf_file)
620 if mode != 'web':
621 #3. resize pages and shift gutters.
622 self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
623 self.notify_watcher('reshape_pdf')
625 #4. add page numbers
626 self.maker.number_pdf(self.pdf_file, n_pages,
627 dir=self.dir, numbers=self.page_numbers)
628 self.notify_watcher("number_pdf")
629 self.notify_watcher()
632 def rotate180(self):
633 """Rotate the pdf 180 degrees so an RTL book can print on LTR
634 presses."""
635 rotated = self.filepath('final-rotate.pdf')
636 unrotated = self.filepath('final-pre-rotate.pdf')
637 #leave the unrotated pdf intact at first, in case of error.
638 rotate_pdf(self.pdf_file, rotated)
639 os.rename(self.pdf_file, unrotated)
640 os.rename(rotated, self.pdf_file)
641 self.notify_watcher()
643 def publish_pdf(self):
644 """Move the finished PDF to its final resting place"""
645 log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
646 os.rename(self.pdf_file, self.publish_file)
647 self.notify_watcher()
649 def load_toc(self):
650 """From the TOC.txt file create a list of TocItems with
651 the attributes <status>, <chapter>, and <title>.
653 <status> is a number, with the following meaning:
655 0 - section heading with no chapter
656 1 - chapter heading
657 2 - book title
659 The TocItem object has convenience functions <is_chapter> and
660 <is_section>.
662 <chapter> is twiki name of the chapter.
664 <title> is a human readable title for the chapter. It is likely to
665 differ from the title given in the chapter's <h1> heading.
667 f = urlopen(self.toc_url)
668 self.toc = []
669 while True:
670 try:
671 self.toc.append(TocItem(f.next().strip(),
672 f.next().strip(),
673 f.next().strip()))
674 except StopIteration:
675 break
676 f.close()
677 self.notify_watcher()
679 def load_book(self, tidy=True):
680 """Fetch and parse the raw html of the book. If tidy is true
681 (default) links in the document will be made absolute."""
682 f = urlopen(self.book_url)
683 html = f.read()
684 f.close()
685 html = ('<html dir="%s"><head>\n<title>%s</title>\n'
686 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
687 '</head>\n<body>\n'
688 '%s\n'
689 '<div style="page-break-before: always; color:#fff;" class="unseen">'
690 'A FLOSSManuals book</div>\n</body></html>'
691 ) % (self.dir, self.book, html)
693 self.save_tempfile('raw.html', html)
695 tree = lxml.html.document_fromstring(html)
696 if tidy:
697 tree.make_links_absolute(self.book_url)
698 self.tree = tree
699 self.headings = [x for x in tree.cssselect('h1')]
700 if self.headings:
701 self.headings[0].set('class', "first-heading")
702 for h1 in self.headings:
703 h1.title = h1.text_content().strip()
704 self.notify_watcher()
706 def load(self):
707 """Wrapper around all necessary load methods."""
708 self.load_book()
709 self.load_toc()
711 def make_contents(self):
712 """Generate HTML containing the table of contents. This can
713 only be done after the main PDF has been made."""
714 header = '<h1>Table of Contents</h1><table class="toc">\n'
715 row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
716 '<td class="pagenumber">%s</td></tr>\n')
717 section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
718 footer = '\n</table>'
720 contents = []
722 chapter = 1
723 page_num = 1
724 subsections = [] # for the subsection heading pages.
726 outline_contents = iter(self.outline_contents)
727 headings = iter(self.headings)
729 for t in self.toc:
730 if t.is_chapter():
731 try:
732 h1 = headings.next()
733 except StopIteration:
734 log("heading not found for %s (previous h1 missing?). Stopping" % t)
735 break
736 h1_text, level, page_num = outline_contents.next()
737 log("%r %r" % (h1.title, h1_text))
738 contents.append(row_tmpl % (chapter, h1.title, page_num))
739 chapter += 1
740 elif t.is_section():
741 contents.append(section_tmpl % t.title)
742 else:
743 log("mystery TOC item: %s" % t)
745 doc = header + '\n'.join(contents) + footer
746 self.notify_watcher()
747 return doc
749 def add_section_titles(self):
750 """Add any section heading pages that the TOC.txt file
751 specifies. These are sub-book, super-chapter groupings.
753 Also add initial numbers to chapters.
755 headings = iter(self.headings)
756 chapter = 1
757 section = None
759 for t in self.toc:
760 if t.is_chapter() and section is not None:
761 try:
762 h1 = headings.next()
763 except StopIteration:
764 log("heading not found for %s (previous h1 missing?)" % t)
765 break
766 item = h1.makeelement('div', Class='chapter')
767 log(h1.title, debug='HTMLGEN')
768 item.text = h1.title
769 _add_initial_number(item, chapter)
771 section.append(item)
773 if not section_placed:
774 log("placing section", debug='HTMLGEN')
775 h1.addprevious(section)
776 section_placed = True
777 else:
778 log("NOT placing section", debug='HTMLGEN')
780 #put a bold number at the beginning of the h1.
781 _add_initial_number(h1, chapter)
782 chapter += 1
784 elif t.is_section():
785 section = self.tree.makeelement('div', Class="subsection")
786 # section Element complains when you try to ask it whether it
787 # has been placed (though it does know)
788 section_placed = False
789 heading = lxml.html.fragment_fromstring(t.title, create_parent='div')
790 heading.set("Class", "subsection-heading")
791 section.append(heading)
793 self.notify_watcher()
796 def add_css(self, css=None, mode='book'):
797 """If css looks like a url, use it as a stylesheet link.
798 Otherwise it is the CSS itself, which is saved to a temporary file
799 and linked to."""
800 log("css is %r" % css)
801 htmltree = self.tree
802 if css is None or not css.strip():
803 defaults = SERVER_DEFAULTS[self.server]
804 url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
805 elif not re.match(r'^http://\S+$', css):
806 fn = self.save_tempfile('objavi.css', css)
807 url = 'file://' + fn
808 else:
809 url = css
810 #XXX for debugging and perhaps sensible anyway
811 #url = url.replace('file:///home/douglas/objavi2', '')
814 #find the head -- it's probably first child but lets not assume.
815 for child in htmltree:
816 if child.tag == 'head':
817 head = child
818 break
819 else:
820 head = htmltree.makeelement('head')
821 htmltree.insert(0, head)
823 link = lxml.etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
824 self.css_url = url
825 self.notify_watcher()
826 return url
828 def set_title(self, title=None):
829 """If a string is supplied, it becomes the book's title.
830 Otherwise a guess is made."""
831 if title:
832 self.title = title
833 else:
834 titles = [x.text_content() for x in self.tree.cssselect('title')]
835 if titles and titles[0]:
836 self.title = titles[0]
837 else:
838 #oh well
839 self.title = 'A Manual About ' + self.book
840 return self.title
842 def _read_localised_template(self, template, fallbacks=['en']):
843 """Try to get the template in the approriate language, otherwise in english."""
844 for lang in [self.lang] + fallbacks:
845 try:
846 fn = template % (lang)
847 f = open(fn)
848 break
849 except IOError, e:
850 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
851 log(e)
852 template = f.read()
853 f.close()
854 return template
856 def compose_inside_cover(self):
857 """create the markup for the preamble inside cover."""
858 template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)
860 if self.isbn:
861 isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
862 else:
863 isbn_text = ''
865 return template % {'date': time.strftime('%Y-%m-%d'),
866 'isbn': isbn_text,
867 'license': self.license,
871 def compose_end_matter(self):
872 """create the markup for the end_matter inside cover. If
873 self.isbn is not set, the html will result in a pdf that
874 spills onto two pages.
876 template = self._read_localised_template(config.END_MATTER_TEMPLATE)
878 d = {'css_url': self.css_url,
879 'title': self.title
882 if self.isbn:
883 d['inside_cover_style'] = ''
884 else:
885 d['inside_cover_style'] = 'page-break-after: always'
887 return template % d
892 def spawn_x(self):
893 """Start an Xvfb instance, using a new server number. A
894 reference to it is stored in self.xvfb, which is used to kill
895 it when the pdf is done.
897 Note that Xvfb doesn't interact well with dbus which is
898 present on modern desktops.
900 #Find an unused server number (in case two cgis are running at once)
901 while True:
902 servernum = random.randrange(50, 500)
903 if not os.path.exists('/tmp/.X%s-lock' % servernum):
904 break
906 self.xserver_no = ':%s' % servernum
908 authfile = self.filepath('Xauthority')
909 os.environ['XAUTHORITY'] = authfile
911 #mcookie(1) eats into /dev/random, so avoid that
912 from hashlib import md5
913 m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
914 mcookie = m.hexdigest()
916 check_call(['xauth', 'add', self.xserver_no, '.', mcookie])
918 self.xvfb = Popen(['Xvfb', self.xserver_no,
919 '-screen', '0', '1024x768x24',
920 '-pixdepths', '32',
921 #'-blackpixel', '0',
922 #'-whitepixel', str(2 ** 24 -1),
923 #'+extension', 'Composite',
924 '-dpi', '96',
925 '-kb',
926 '-nolisten', 'tcp',
929 # We need to wait a bit before the Xvfb is ready. but the
930 # downloads are so slow that that probably doesn't matter
932 self.xvfb_ready_time = time.time() + 2
934 os.environ['DISPLAY'] = self.xserver_no
935 log(self.xserver_no)
937 def wait_for_xvfb(self):
938 """wait until a previously set time before continuing. This
939 is so Xvfb has time to properly start."""
940 if hasattr(self, 'xvfb'):
941 d = self.xvfb_ready_time - time.time()
942 if d > 0:
943 time.sleep(d)
944 self.notify_watcher()
946 def cleanup_x(self):
947 """Try very hard to kill off Xvfb. In addition to killing
948 this instance's xvfb, occasionally (randomly) search for
949 escaped Xvfb instances and kill those too."""
950 if not hasattr(self, 'xvfb'):
951 return
952 check_call(['xauth', 'remove', self.xserver_no])
953 p = self.xvfb
954 log("trying to kill Xvfb %s" % p.pid)
955 os.kill(p.pid, 15)
956 for i in range(10):
957 if p.poll() is not None:
958 log("%s died with %s" % (p.pid, p.poll()))
959 break
960 log("%s not dead yet" % p.pid)
961 time.sleep(0.2)
962 else:
963 log("Xvfb would not die! kill -9! kill -9!")
964 os.kill(p.pid, 9)
966 if random.random() < 0.1:
967 # occasionally kill old xvfbs and soffices, if there are any.
968 self.kill_old_processes()
970 def kill_old_processes(self):
971 """Sometimes, despite everything, Xvfb or soffice instances
972 hang around well after they are wanted -- for example if the
973 cgi process dies particularly badly. So kill them if they have
974 been running for a long time."""
975 log("running kill_old_processes")
976 p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
977 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
978 data = p.communicate()[0].strip()
979 if data:
980 lines = data.split('\n')
981 for line in lines:
982 log('dealing with ps output "%s"' % line)
983 try:
984 pid, days, hours, minutes, seconds \
985 = re.match(r'^(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
986 except AttributeError:
987 log("Couldn't parse that line!")
988 # 50 minutes should be enough xvfb time for anyone
989 if days or hours or int(minutes) > 50:
990 log("going to kill pid %s" % pid)
991 os.kill(int(pid), 15)
992 time.sleep(0.5)
993 try:
994 os.kill(int(pid), 9)
995 log('killing %s with -9')
996 except OSError, e:
997 pass
998 self.notify_watcher()
1000 def cleanup(self):
1001 self.cleanup_x()
1002 if not config.KEEP_TEMP_FILES:
1003 for fn in os.listdir(self.workdir):
1004 os.remove(os.path.join(self.workdir, fn))
1005 os.rmdir(self.workdir)
1006 else:
1007 log("NOT removing '%s', containing the following files:" % self.workdir)
1008 log(*os.listdir(self.workdir))
1010 self.notify_watcher()