quieter but more useful logging in twiki import, and try for utf-8 credits
[objavi2.git] / objavi / fmbook.py
blob0fad013fc93cd1e6ea56e59b14325ea8819e59df
1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
21 PDF"""
23 import os, sys
24 import tempfile
25 import re, time
26 import random
27 from subprocess import Popen, check_call, PIPE
28 from cStringIO import StringIO
29 from urllib2 import urlopen, HTTPError
30 import zipfile
31 import traceback
32 from string import ascii_letters
33 from pprint import pformat
35 try:
36 import simplejson as json
37 except ImportError:
38 import json
40 import lxml, lxml.html
41 from lxml import etree
43 from objavi import config, epub_utils
44 from objavi.cgi_utils import log, run, shift_file, make_book_name, guess_lang, guess_text_dir
45 from objavi.pdf import PageSettings, count_pdf_pages, concat_pdfs, rotate_pdf, parse_outline
46 from objavi.epub import add_guts, _find_tag
48 from iarchive import epub as ia_epub
49 from booki.xhtml_utils import EpubChapter
50 from booki.bookizip import get_metadata, add_metadata, clear_metadata, get_metadata_schemes
# Working directory for intermediate build artifacts.
TMPDIR = os.path.abspath(config.TMPDIR)
# CGI environment: document root and host of the serving web server
# (with fallbacks for running outside a web server).
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
HTTP_HOST = os.environ.get('HTTP_HOST', '')
# Finished books are published under <document root>/books/.
PUBLISH_PATH = "%s/books/" % DOC_ROOT
58 def _get_best_title(tocpoint):
59 if 'html_title' in tocpoint:
60 return tocpoint['html_title']
61 if 'title' in tocpoint:
62 return tocpoint['title']
63 return 'Untitled'
66 def _add_initial_number(e, n):
67 """Put a styled chapter number n at the beginning of element e."""
68 initial = e.makeelement("strong", Class="initial")
69 e.insert(0, initial)
70 initial.tail = ' '
71 if e.text is not None:
72 initial.tail += e.text
73 e.text = ''
74 initial.text = "%s." % n
def expand_toc(toc, depth=1, index=0):
    """Annotate a recursive TOC in place with convenience fields.

    Each point gains 'depth', 'filename', 'fragment' (the part after
    '#', or None when there is no '#') and a serial 'index'.  Children
    are processed recursively.  Returns the next unused index.
    """
    for point in toc:
        url = point['url'].lstrip('/')
        filename, sep, fragment = url.partition('#')
        point['depth'] = depth
        point['filename'] = filename
        point['fragment'] = fragment if sep else None
        point['index'] = index
        index += 1
        if 'children' in point:
            index = expand_toc(point['children'], depth + 1, index)
    return index
92 def _serialise(rtoc, stoc, depth):
93 for item in rtoc:
94 url = item['url'].lstrip('/')
95 bits = url.split('#', 1)
96 filename = bits[0]
97 fragment = (bits[1] if len(bits) == 2 else None)
98 stoc.append({"depth": depth,
99 "title": item['title'],
100 "url": url,
101 "filename": filename,
102 "fragment": fragment,
103 "type": item['type']
105 if 'children' in item:
106 _serialise(item['children'], stoc, depth + 1)
def serialise_toc(rtoc):
    """Turn the recursive TOC structure into a flat list of points.

    Each flattened point is annotated with its zero-based 'position' in
    reading order; fields are reformatted for convenience (see
    _serialise).
    """
    stoc = []
    _serialise(rtoc, stoc, 1)
    for position, point in enumerate(stoc):
        point['position'] = position
    return stoc
def filename_toc_map(rtoc):
    """Group TOC points by the file they live in.

    Walks the recursive TOC depth-first and returns a dict mapping
    filename -> list of TOC points in document order.
    """
    tocmap = {}
    log(rtoc)

    def _walk(points):
        for point in points:
            log(point.keys())
            tocmap.setdefault(point['filename'], []).append(point)
            if 'children' in point:
                _walk(point['children'])

    _walk(rtoc)
    return tocmap
class Book(object):
    """A book being converted from a booki-zip into PDF/ODT/epub."""
    # Page-numbering styles: arabic numerals for the body, roman for
    # the preamble (title page, table of contents).
    page_numbers = 'latin'
    preamble_page_numbers = 'roman'
def notify_watcher(self, message=None):
    """Tell the progress watcher callback, if any, that *message* happened.

    With no explicit message, the name of the calling function is used.
    """
    if not self.watcher:
        return
    if message is None:
        # Default to the caller's function name.
        message = traceback.extract_stack(None, 2)[0][2]
    log("notify_watcher called with '%s'" % message)
    self.watcher(message)
def __enter__(self):
    """Context-manager entry: the book itself is the managed object."""
    return self

def __exit__(self, exc_type, exc_value, traceback):
    """Context-manager exit: signal completion and clean up temp files."""
    self.notify_watcher('finished')
    self.cleanup()
    # Returning None lets any exception propagate to the caller.
152 def __init__(self, book, server, bookname, project=None,
153 page_settings=None, watcher=None, isbn=None,
154 license=config.DEFAULT_LICENSE, title=None,
155 max_age=0):
156 log("*** Starting new book %s ***" % bookname,
157 "starting zipbook with", server, book, project)
158 self.watcher = watcher
159 self.notify_watcher('start')
160 self.bookname = bookname
161 self.book = book
162 self.server = server
163 self.project = project
164 self.cookie = ''.join(random.sample(ascii_letters, 10))
165 try:
166 blob = fetch_zip(server, book, project, save=True, max_age=max_age)
167 except HTTPError, e:
168 #log(e.url)
169 traceback.print_exc()
170 self.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e.url, e.code, e.msg))
171 #not much to do?
172 sys.exit()
173 f = StringIO(blob)
174 self.notify_watcher('fetch_zip')
175 self.store = zipfile.ZipFile(f, 'r')
176 self.info = json.loads(self.store.read('info.json'))
177 for k in ('manifest', 'metadata', 'spine', 'TOC'):
178 if k not in self.info:
179 raise ObjaviError('info.json of %s lacks vital element "%s"' %
180 (bookname, k))
181 #check types also?
183 self.metadata = self.info['metadata']
184 self.spine = self.info['spine']
185 self.manifest = self.info['manifest']
187 if server == config.LOCALHOST: # [DEPRECATED]
188 server = get_metadata(self.metadata, 'server', ns=config.FM, default=[server])[0]
189 book = get_metadata(self.metadata, 'book', ns=config.FM, default=[book])[0]
191 log(pformat(self.metadata))
192 self.lang = get_metadata(self.metadata, 'language', default=[None])[0]
193 if not self.lang:
194 self.lang = guess_lang(server, book)
195 log('guessed lang as %s' % self.lang)
197 self.toc_header = get_metadata(self.metadata, 'toc_header', ns=config.FM, default=[None])[0]
198 if not self.toc_header:
199 self.toc_header = config.SERVER_DEFAULTS[server]['toc_header']
201 self.dir = get_metadata(self.metadata, 'dir', ns=config.FM, default=[None])[0]
202 if not self.dir:
203 self.dir = guess_text_dir(server, book)
206 #Patch in the extra metadata. (lang and dir may be set from config)
207 #these should be read from zip -- so should go into zip?
208 for var, key, scheme, ns in (
209 (isbn, 'id', 'ISBN', config.DC),
210 (license, 'rights', 'License', config.DC),
211 (title, 'title', '', config.DC),
212 (self.lang, 'language', '', config.DC),
213 (self.dir, 'dir', '', config.FM),
215 if var is not None:
216 add_metadata(self.metadata, key, var, scheme=scheme, ns=ns)
218 self.isbn = get_metadata(self.metadata, 'id', scheme='ISBN', default=[None])[0]
219 self.license = get_metadata(self.metadata, 'rights', scheme='License', default=[None])[0]
221 self.toc = self.info['TOC']
222 expand_toc(self.toc)
224 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=TMPDIR)
225 os.chmod(self.workdir, 0755)
227 self.body_html_file = self.filepath('body.html')
228 self.body_pdf_file = self.filepath('body.pdf')
229 self.preamble_html_file = self.filepath('preamble.html')
230 self.preamble_pdf_file = self.filepath('preamble.pdf')
231 self.tail_html_file = self.filepath('tail.html')
232 self.tail_pdf_file = self.filepath('tail.pdf')
233 self.isbn_pdf_file = None
234 self.pdf_file = self.filepath('final.pdf')
235 self.body_odt_file = self.filepath('body.odt')
237 self.publish_file = os.path.join(PUBLISH_PATH, bookname)
238 self.publish_url = os.path.join(config.PUBLISH_URL, bookname)
240 if page_settings is not None:
241 self.maker = PageSettings(**page_settings)
243 titles = get_metadata(self.metadata, 'title')
244 if titles:
245 self.title = titles[0]
246 else:
247 self.title = 'A Manual About ' + self.book
249 self.notify_watcher()
if config.TRY_BOOK_CLEANUP_ON_DEL:
    #Dont even define __del__ if it is not used.
    _try_cleanup_on_del = True

    def __del__(self):
        # Best-effort removal of the working directory at garbage
        # collection time; guarded by a flag so a failing cleanup cannot
        # retrigger itself from within __del__.
        if self._try_cleanup_on_del and os.path.exists(self.workdir):
            self._try_cleanup_on_del = False  #or else you can get in bad cycles
            self.cleanup()
def get_tree_by_id(self, id):
    """get an HTML tree from the given manifest ID"""
    # Look up the stored file and its declared mimetype.
    name = self.manifest[id]['url']
    mimetype = self.manifest[id]['mimetype']
    s = self.store.read(name)
    f = StringIO(s)
    if mimetype == 'text/html':
        try:
            tree = lxml.html.parse(f)
        except etree.XMLSyntaxError, e:
            # Unparseable chapter: log it and substitute an empty
            # document rather than aborting the whole book.
            log('Could not parse html ID %r, filename %r, string %r... exception %s' %
                (id, name, s[:20], e))
            tree = lxml.html.document_fromstring('<html><body></body></html>').getroottree()
    elif 'xml' in mimetype:  #XXX or is this just asking for trouble?
        tree = etree.parse(f)
    else:
        # Not markup at all: return the raw byte string.
        tree = f.read()
    f.close()
    return tree
def filepath(self, fn):
    """Absolute path of *fn* inside this book's working directory."""
    return os.path.join(self.workdir, fn)
def save_data(self, fn, data):
    """Write *data* to file *fn*, encoding unicode text as utf-8."""
    if isinstance(data, unicode):
        # lossy 'ignore' keeps un-encodable characters from aborting a build
        data = data.encode('utf8', 'ignore')
    with open(fn, 'w') as f:
        f.write(data)
def save_tempfile(self, fn, data):
    """Save *data* as *fn* inside the temporary working directory.

    The directory is removed when processing finishes.  Returns the
    absolute path of the saved file.
    """
    path = self.filepath(fn)
    self.save_data(path, data)
    return path
def make_oo_doc(self):
    """Make an openoffice document, using the html2odt script."""
    # html2odt drives OpenOffice, which needs the X server to be up.
    self.wait_for_xvfb()
    html_text = etree.tostring(self.tree, method="html")
    self.save_data(self.body_html_file, html_text)
    run([config.HTML2ODT, self.workdir, self.body_html_file, self.body_odt_file])
    log("Publishing %r as %r" % (self.body_odt_file, self.publish_file))
    # The ODT is the final product in this mode: move it straight to
    # the publish location.
    os.rename(self.body_odt_file, self.publish_file)
    self.notify_watcher()
def extract_pdf_outline(self):
    """Read the chapter outline (and page count) from the body PDF.

    Sets self.outline_contents to a list of (title, depth, page number)
    tuples and returns the number of pages.  If the outline comes back
    empty -- apparently this happens with some non-ascii headings --
    retry with headings replaced by unique ascii keys, then map the
    keys back to the real titles.
    """
    #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
    debugf = self.filepath('outline.txt')
    self.outline_contents, self.outline_text, number_of_pages = \
        parse_outline(self.body_pdf_file, 1, debugf)

    if not self.outline_contents:
        #probably problems with international text. need a horrible hack
        log('no outline: trying again with ascii headings')
        import copy
        tree = copy.deepcopy(self.tree)
        titlemap = {}
        # Replace every heading with a unique ascii key, remembering
        # the real title so it can be restored afterwards.
        for tag in ('h1', 'h2', 'h3', 'h4'):
            for i, e in enumerate(tree.getiterator(tag)):
                key = "%s_%s" % (tag, i)
                titlemap[key] = e.text_content().strip(config.WHITESPACE_AND_NULL)
                del e[:]
                if tag == 'h1':
                    # Rebuild the styled chapter-number span inside h1s
                    # so pagination matches the real rendering.
                    e = lxml.etree.SubElement(e, "strong", Class="initial")
                e.text = key
                log("key: %r, text: %r, value: %r" % (key, e.text, titlemap[key]))

        ascii_html_file = self.filepath('body-ascii-headings.html')
        ascii_pdf_file = self.filepath('body-ascii-headings.pdf')
        html_text = lxml.etree.tostring(tree, method="html")
        self.save_data(ascii_html_file, html_text)
        self.maker.make_raw_pdf(ascii_html_file, ascii_pdf_file, outline=True)
        debugf = self.filepath('ascii_outline.txt')
        ascii_contents, ascii_text, number_of_ascii_pages = \
            parse_outline(ascii_pdf_file, 1, debugf)
        self.outline_contents = []
        log("number of pages: %s, post ascii: %s" %
            (number_of_pages, number_of_ascii_pages))
        for ascii_title, depth, pageno in ascii_contents:
            if ascii_title[-4:] == '&#0;':  #stupid [something] puts this in
                ascii_title = ascii_title[:-4]
            if ' ' in ascii_title:
                # Keep only the key part (after the chapter number).
                ascii_title = ascii_title.rsplit(' ', 1)[1]
            title = titlemap.get(ascii_title, '')
            log((ascii_title, title, depth, pageno))

            self.outline_contents.append((title, depth, pageno))
    else:
        for x in self.outline_contents:
            log(x)

    self.notify_watcher()
    return number_of_pages
def make_body_pdf(self):
    """Make a pdf of the HTML, using webkit"""
    #1. Save the html
    html_text = etree.tostring(self.tree, method="html")
    self.save_data(self.body_html_file, html_text)

    #2. Make a pdf of it
    self.maker.make_raw_pdf(self.body_html_file, self.body_pdf_file, outline=True)
    self.notify_watcher('generate_pdf')

    #3. extract the outline (and page count) for the table of contents
    n_pages = self.extract_pdf_outline()

    log("found %s pages in pdf" % n_pages)
    #4. resize pages, shift gutters, even pages
    self.maker.reshape_pdf(self.body_pdf_file, self.dir, centre_end=True)
    self.notify_watcher('reshape_pdf')

    #5 add page numbers
    self.maker.number_pdf(self.body_pdf_file, n_pages, dir=self.dir,
                          numbers=self.page_numbers)
    self.notify_watcher("number_pdf")
    self.notify_watcher()
def make_preamble_pdf(self):
    """Build the preamble PDF: inside cover plus table of contents.

    Must run after make_body_pdf, since the contents' page numbers come
    from the body PDF's outline.
    """
    contents = self.make_contents()
    inside_cover_html = self.compose_inside_cover()
    log(self.dir, self.css_url, self.title, inside_cover_html,
        self.toc_header, contents, self.title)
    html = ('<html dir="%s"><head>\n'
            '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
            '<link rel="stylesheet" href="%s" />\n'
            '</head>\n<body>\n'
            '<h1 class="frontpage">%s</h1>'
            '%s\n'
            '<div class="contents"><h1>%s</h1>\n%s</div>\n'
            '<div style="page-break-after: always; color:#fff" class="unseen">.'
            '<!--%s--></div></body></html>'
            ) % (self.dir, self.css_url, self.title, inside_cover_html.decode('utf-8'),
                 self.toc_header, contents, self.title)
    self.save_data(self.preamble_html_file, html)

    self.maker.make_raw_pdf(self.preamble_html_file, self.preamble_pdf_file)

    self.maker.reshape_pdf(self.preamble_pdf_file, self.dir, centre_start=True)

    # Roman numbering for the preamble, offset so the title page is
    # effectively page -2.
    self.maker.number_pdf(self.preamble_pdf_file, None, dir=self.dir,
                          numbers=self.preamble_page_numbers,
                          number_start=-2)

    self.notify_watcher()
def make_end_matter_pdf(self):
    """Make an inside back cover and a back cover. If there is an
    isbn number its barcode will be put on the back cover."""
    if self.isbn:
        self.isbn_pdf_file = self.filepath('isbn.pdf')
        self.maker.make_barcode_pdf(self.isbn, self.isbn_pdf_file)
        self.notify_watcher('make_barcode_pdf')

    end_matter = self.compose_end_matter()
    log(end_matter)
    # decode here so save_data re-encodes everything uniformly as utf-8
    self.save_data(self.tail_html_file, end_matter.decode('utf-8'))
    self.maker.make_raw_pdf(self.tail_html_file, self.tail_pdf_file)

    self.maker.reshape_pdf(self.tail_pdf_file, self.dir, centre_start=True,
                           centre_end=True, even_pages=False)
    self.notify_watcher()
def make_book_pdf(self):
    """A convenient wrapper of a few necessary steps"""
    # now the Xvfb server is needed. make sure it has had long enough to get going
    self.wait_for_xvfb()
    self.make_body_pdf()
    self.make_preamble_pdf()
    self.make_end_matter_pdf()

    # Stitch the pieces together; isbn_pdf_file may be None when there
    # is no barcode page.
    concat_pdfs(self.pdf_file, self.preamble_pdf_file,
                self.body_pdf_file, self.tail_pdf_file,
                self.isbn_pdf_file)

    self.notify_watcher('concatenated_pdfs')
def make_simple_pdf(self, mode):
    """Make a simple pdf document without contents or separate
    title page. This is used for multicolumn newspapers and for
    web-destined pdfs."""
    self.wait_for_xvfb()
    #0. Add heading to begining of html
    body = list(self.tree.cssselect('body'))[0]
    e = body.makeelement('h1', {'id': 'book-title'})
    e.text = self.title
    body.insert(0, e)
    intro = lxml.html.fragment_fromstring(self.compose_inside_cover())
    e.addnext(intro)

    #0.5 adjust parameters to suit the particular kind of output
    if mode == 'web':
        # Web PDFs are read on screen: no binding gutter needed.
        self.maker.gutter = 0

    #1. Save the html
    html_text = etree.tostring(self.tree, method="html")
    self.save_data(self.body_html_file, html_text)

    #2. Make a pdf of it (direct to to final pdf)
    self.maker.make_raw_pdf(self.body_html_file, self.pdf_file, outline=True)
    self.notify_watcher('generate_pdf')
    n_pages = count_pdf_pages(self.pdf_file)

    if mode != 'web':
        #3. resize pages and shift gutters.
        self.maker.reshape_pdf(self.pdf_file, self.dir, centre_end=True)
        self.notify_watcher('reshape_pdf')

        #4. add page numbers
        self.maker.number_pdf(self.pdf_file, n_pages,
                              dir=self.dir, numbers=self.page_numbers)
        self.notify_watcher("number_pdf")
    self.notify_watcher()
def rotate180(self):
    """Rotate the pdf 180 degrees so an RTL book can print on LTR
    presses."""
    rotated_path = self.filepath('final-rotate.pdf')
    original_path = self.filepath('final-pre-rotate.pdf')
    # Write the rotated copy first so the original survives any
    # failure, then swap it into place.
    rotate_pdf(self.pdf_file, rotated_path)
    os.rename(self.pdf_file, original_path)
    os.rename(rotated_path, self.pdf_file)
    self.notify_watcher()
def publish_pdf(self):
    """Move the finished PDF to its final resting place"""
    log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
    # Single rename out of the temp workdir into the public books dir.
    os.rename(self.pdf_file, self.publish_file)
    self.notify_watcher()
def concat_html(self):
    """Join all the chapters together into one tree. Keep the TOC
    up-to-date along the way."""
    #each manifest item looks like:
    #{'contributors': []
    #'license': [],
    #'mimetype': '',
    #'rightsholders': []
    #'url': ''}
    doc = lxml.html.document_fromstring('<html><body></body></html>')
    tocmap = filename_toc_map(self.toc)
    for ID in self.spine:
        details = self.manifest[ID]
        log(ID, pformat(details))
        try:
            root = self.get_tree_by_id(ID).getroot()
        except Exception:
            # Skip a chapter that can't be loaded as a tree, but say so
            # (this was a silent bare except that hid all errors).
            log('skipping ID %r: could not get a parse tree' % (ID,))
            traceback.print_exc()
            continue
        #handle any TOC points in this file
        for point in tocmap[details['url']]:
            #if the url has a #identifier, use it. Otherwise, make
            #one up, using a hidden element at the beginning of
            #the inserted document.
            #XXX this will break if different files use the same ids
            #XXX should either replace all, or replace selectively.
            if point['fragment']:
                fragment = point['fragment']
            else:
                body = _find_tag(root, 'body')
                fragment = '%s_%s' % (self.cookie, point['index'])
                #reuse first tag if it is suitable.
                if (len(body) and
                    body[0].tag in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
                    if body[0].get('id') is None:
                        body[0].set('id', fragment)
                    else:
                        fragment = body[0].get('id')
                    #the chapter starts with a heading. that heading should be the chapter name.
                    if body[0].tag in ('h1', 'h2', 'h3'):
                        log('chapter has title "%s", found html title "%s"' %
                            (point['title'], body[0].text_content()))
                        point['html_title'] = body[0].text_content()
                else:
                    marker = body.makeelement('div', style="display:none",
                                              id=fragment)
                    body.insert(0, marker)
                point['html_id'] = fragment

        add_guts(root, doc)
    return doc
def unpack_static(self):
    """Extract static files from the zip for the html to refer to."""
    static_files = [x['url'] for x in self.manifest.values()
                    if x['url'].startswith('static')]
    if static_files:
        os.mkdir(self.filepath('static'))

    for name in static_files:
        s = self.store.read(name)
        # Static assets are mostly images: write in binary mode so the
        # bytes survive untouched on every platform (was text mode 'w').
        f = open(self.filepath(name), 'wb')
        f.write(s)
        f.close()
    self.notify_watcher()
def load_book(self):
    """Unpack the zip and build the single concatenated HTML tree."""
    #XXX concatenate the HTML to match how TWiki version worked.
    # This is perhaps foolishly early -- throwing away useful boundaries.
    self.unpack_static()
    self.tree = self.concat_html()
    self.save_tempfile('raw.html', etree.tostring(self.tree, method='html'))

    self.headings = list(self.tree.cssselect('h1'))
    if self.headings:
        self.headings[0].set('class', "first-heading")
    for h1 in self.headings:
        h1.title = h1.text_content().strip()
    self.notify_watcher()
def make_contents(self):
    """Generate HTML containing the table of contents. This can
    only be done after the main PDF has been made, because the
    page numbers are contained in the PDF outline."""
    header = '<h1>Table of Contents</h1><table class="toc">\n'
    row_tmpl = ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
                '<td class="pagenumber">%s</td></tr>\n')
    empty_section_tmpl = ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
    section_tmpl = ('<tr><td class="section" colspan="3">%s</td></tr>\n')
    footer = '\n</table>'

    contents = []

    chapter = 1
    page_num = 1

    # (unused locals "subsections" and "headings" removed)
    outline_contents = iter(self.outline_contents)

    for section in self.toc:
        if not section.get('children'):
            # A childless top-level point renders as a single row.
            contents.append(empty_section_tmpl % section['title'])
            continue
        contents.append(section_tmpl % section['title'])

        for point in section['children']:
            try:
                # Page numbers come from the PDF outline, in order.
                h1_text, level, page_num = outline_contents.next()
            except StopIteration:
                log("contents data not found for %s. Stopping" % (point,))
                break
            contents.append(row_tmpl % (chapter, _get_best_title(point), page_num))
            chapter += 1

    doc = header + '\n'.join(contents) + footer
    self.notify_watcher()
    return doc
def add_section_titles(self):
    """Add any section heading pages that the TOC.txt file
    specifies. These are sub-book, super-chapter groupings.

    Also add initial numbers to chapters.
    """
    headings = iter(self.headings)
    chapter = 1
    section = None
    log(self.toc)
    for t in self.toc:
        #only top level sections get a subsection page,
        #and only if they have children.
        if t.get('children'):
            section = self.tree.makeelement('div', Class="objavi-subsection")
            heading = etree.SubElement(section, 'div', Class="objavi-subsection-heading")
            heading.text = t['title']
            for child in t['children']:
                # One row per chapter on the subsection page.
                item = etree.SubElement(section, 'div', Class="objavi-chapter")
                if 'html_title' in child:
                    item.text = child['html_title']
                    heading = self.tree.cssselect('#'+ child['html_id'])
                    if heading:
                        _add_initial_number(heading[0], chapter)
                else:
                    item.text = child['title']
                    _add_initial_number(item, chapter)
                log(item.text, debug='HTMLGEN')
                chapter += 1
            log("#%s is %s" % (t['html_id'], self.tree.cssselect('#'+ t['html_id'])))
            # The subsection page goes immediately before its first chapter.
            location = self.tree.cssselect('#'+ t['html_id'])[0]
            location.addprevious(section)

    self.notify_watcher()
def add_css(self, css=None, mode='book'):
    """If css looks like a url, use it as a stylesheet link.
    Otherwise it is the CSS itself, which is saved to a temporary file
    and linked to."""
    log("css is %r" % css)
    htmltree = self.tree
    if css is None or not css.strip():
        # No CSS supplied: use this server's default for the mode.
        defaults = config.SERVER_DEFAULTS[self.server]
        url = 'file://' + os.path.abspath(defaults['css-%s' % mode])
    elif not re.match(r'^http://\S+$', css):
        # Raw CSS text: save it and link via a file:// url.
        fn = self.save_tempfile('objavi.css', css)
        url = 'file://' + fn
    else:
        url = css
    #XXX for debugging and perhaps sensible anyway
    #url = url.replace('file:///home/douglas/objavi2', '')

    #find the head -- it's probably first child but lets not assume.
    for child in htmltree:
        if child.tag == 'head':
            head = child
            break
    else:
        # No head element at all: create one.
        head = htmltree.makeelement('head')
        htmltree.insert(0, head)

    link = etree.SubElement(head, 'link', rel='stylesheet', type='text/css', href=url)
    self.css_url = url
    self.notify_watcher()
    return url
688 def _read_localised_template(self, template, fallbacks=['en']):
689 """Try to get the template in the approriate language, otherwise in english."""
690 for lang in [self.lang] + fallbacks:
691 try:
692 fn = template % (lang)
693 f = open(fn)
694 break
695 except IOError, e:
696 log("couldn't open inside front cover for lang %s (filename %s)" % (lang, fn))
697 log(e)
698 template = f.read()
699 f.close()
700 return template
def compose_inside_cover(self):
    """create the markup for the preamble inside cover."""
    template = self._read_localised_template(config.INSIDE_FRONT_COVER_TEMPLATE)

    if self.isbn:
        isbn_text = '<b>ISBN :</b> %s <br>' % self.isbn
    else:
        isbn_text = ''

    # Fill in publication date, optional ISBN line, and license name.
    return template % {'date': time.strftime('%Y-%m-%d'),
                       'isbn': isbn_text,
                       'license': self.license,
                       }  # closing brace restored -- dropped from this copy of the source
def compose_end_matter(self):
    """create the markup for the end_matter inside cover. If
    self.isbn is not set, the html will result in a pdf that
    spills onto two pages.
    """
    template = self._read_localised_template(config.END_MATTER_TEMPLATE)

    d = {'css_url': self.css_url,
         'title': self.title,
         }  # closing brace restored -- dropped from this copy of the source

    if self.isbn:
        d['inside_cover_style'] = ''
    else:
        # Without a barcode page, force a break so the back cover still
        # lands on its own page.
        d['inside_cover_style'] = 'page-break-after: always'

    return template % d
def make_epub(self, use_cache=False):
    """Make an epub version of the book, using Mike McCabe's
    epub module for the Internet Archive."""
    ebook = ia_epub.Book(self.publish_file, content_dir='')

    def add_file(ID, filename, mediatype, content):
        # ia_epub wants byte strings for the manifest attributes.
        ebook.add_content({'media-type': mediatype.encode('utf-8'),
                           'id': ID.encode('utf-8'),
                           'href': filename.encode('utf-8'),
                           }, content)

    toc = self.info['TOC']

    #manifest
    filemap = {}  #map html to corresponding xhtml
    spinemap = {}  #map IDs to multi-file chapters
    for ID in self.manifest:
        details = self.manifest[ID]
        log(ID, pformat(details))
        fn, mediatype = details['url'], details['mimetype']
        content = self.store.read(fn)
        if mediatype == 'text/html':
            #convert to application/xhtml+xml, and perhaps split
            c = EpubChapter(self.server, self.book, ID, content,
                            use_cache=use_cache)
            c.remove_bad_tags()
            if fn[-5:] == '.html':
                fnbase = fn[:-5]
            else:
                fnbase = fn
            fnx = fnbase + '.xhtml'
            mediatype = 'application/xhtml+xml'

            # NOTE(review): split_html is not imported in the visible
            # part of this file -- confirm where it comes from.
            fragments = split_html(c.as_xhtml(),
                                   compressed_size=self.store.getinfo(fn).compress_size)

            #add the first one as if it is the whole thing (as it often is)
            add_file(ID, fnx, mediatype, fragments[0])
            filemap[fn] = fnx
            if len(fragments) > 1:
                spine_ids = [ID]
                spinemap[ID] = spine_ids
                #add any extras
                for i in range(1, len(fragments)):
                    # XXX it is possible for duplicates if another
                    # file happens to have this name. Ignore for now
                    _id = '%s_SONY_WORKAROUND_%s' % (ID, i)
                    spine_ids.append(_id)
                    add_file(_id,
                             '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase, i),
                             mediatype, fragments[i])

        else:
            add_file(ID, fn, mediatype, content)

    #toc
    ncx = epub_utils.make_ncx(toc, self.metadata, filemap)
    ebook.add(ebook.content_dir + 'toc.ncx', ncx)

    #spine
    for ID in self.spine:
        if ID in spinemap:
            # A split chapter contributes all its fragment IDs in order.
            for x in spinemap[ID]:
                ebook.add_spine_item({'idref': x})
        else:
            ebook.add_spine_item({'idref': ID})

    #metadata -- no use of attributes (yet)
    # and fm: metadata disappears for now
    DCNS = config.DCNS
    DC = config.DC
    meta_info_items = []
    for ns, namespace in self.metadata.items():
        for keyword, schemes in namespace.items():
            if ns:
                keyword = '{%s}%s' % (ns, keyword)
            for scheme, values in schemes.items():
                for value in values:
                    item = {
                        'item': keyword,
                        'text': value,
                    # NOTE(review): the closing brace of this dict and
                    # (apparently) the append of `item` onto
                    # meta_info_items have been dropped from this copy
                    # of the source -- restore from upstream.
                    if scheme:
                        if keyword in (DCNS + 'creator', DCNS + 'contributor'):
                            item['atts'] = {'role': scheme}
                        else:
                            item['atts'] = {'scheme': scheme}

    has_authors = 'creator' in self.metadata[DC]
    if not has_authors and config.CLAIM_UNAUTHORED:
        meta_info_items.append({'item': DCNS + 'creator',
                                'text': 'The Contributors'})

        # NOTE(review): `authors` is not defined anywhere in the visible
        # source -- lines appear to be missing here; confirm against
        # upstream before relying on this branch.
        meta_info_items.append({'item': DCNS + 'rights',
                                'text': 'This book is free. Copyright %s' % (', '.join(authors))}

    tree_str = ia_epub.make_opf(meta_info_items,
                                ebook.manifest_items,
                                ebook.spine_items,
                                ebook.guide_items,
                                ebook.cover_id)
    ebook.add(ebook.content_dir + 'content.opf', tree_str)
    ebook.z.close()
    self.notify_watcher()
def publish_s3(self):
    """Push the book's epub to archive.org, using S3."""
    #XXX why only epub?
    # Read the S3 credentials from the files named in config.
    secrets = {}
    for x in ('S3_SECRET', 'S3_ACCESSKEY'):
        fn = getattr(config, x)
        f = open(fn)
        secrets[x] = f.read().strip()
        f.close()

    # XXX this logs live credentials -- consider removing.
    log(secrets)
    now = time.strftime('%F')
    s3output = self.filepath('s3-output.txt')
    s3url = 'http://s3.us.archive.org/booki-%s/%s' % (self.book, self.bookname)
    detailsurl = 'http://archive.org/details/booki-%s' % (self.book,)
    headers = [
        'x-amz-auto-make-bucket:1',
        "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets,
        'x-archive-meta-mediatype:texts',
        'x-archive-meta-collection:opensource',
        'x-archive-meta-title:%s' % (self.book,),
        'x-archive-meta-date:%s' % (now,),
        'x-archive-meta-creator:FLOSS Manuals Contributors',
        ]  # closing bracket restored -- dropped from this copy of the source

    if self.license in config.LICENSES:
        headers.append('x-archive-meta-licenseurl:%s' % config.LICENSES[self.license])

    # Upload with curl; the response body goes to s3output for debugging.
    argv = ['curl', '--location', '-s', '-o', s3output]
    for h in headers:
        argv.extend(('--header', h))
    argv.extend(('--upload-file', self.publish_file, s3url,))

    log(' '.join(repr(x) for x in argv))
    check_call(argv, stdout=sys.stderr)
    self.notify_watcher()
    return detailsurl, s3url
def spawn_x(self):
    """Start an Xvfb instance, using a new server number. A
    reference to it is stored in self.xvfb, which is used to kill
    it when the pdf is done.

    Note that Xvfb doesn't interact well with dbus which is
    present on modern desktops.
    """
    #Find an unused server number (in case two cgis are running at once)
    while True:
        servernum = random.randrange(50, 500)
        if not os.path.exists('/tmp/.X%s-lock' % servernum):
            break

    self.xserver_no = ':%s' % servernum

    authfile = self.filepath('Xauthority')
    os.environ['XAUTHORITY'] = authfile

    #mcookie(1) eats into /dev/random, so avoid that
    from hashlib import md5
    m = md5("%r %r %r %r %r" % (self, os.environ, os.getpid(), time.time(), os.urandom(32)))
    mcookie = m.hexdigest()

    check_call(['xauth', 'add', self.xserver_no, '.', mcookie])

    self.xvfb = Popen(['Xvfb', self.xserver_no,
                       '-screen', '0', '1024x768x24',
                       '-pixdepths', '32',
                       #'-blackpixel', '0',
                       #'-whitepixel', str(2 ** 24 -1),
                       #'+extension', 'Composite',
                       '-dpi', '96',
                       '-kb',
                       '-nolisten', 'tcp',
                       ])  # call close restored -- dropped from this copy of the source

    # We need to wait a bit before the Xvfb is ready. but the
    # downloads are so slow that that probably doesn't matter
    self.xvfb_ready_time = time.time() + 2

    os.environ['DISPLAY'] = self.xserver_no
    log(self.xserver_no)
def wait_for_xvfb(self):
    """Sleep until the readiness time recorded by spawn_x has passed,
    giving Xvfb a chance to finish starting."""
    if hasattr(self, 'xvfb'):
        remaining = self.xvfb_ready_time - time.time()
        if remaining > 0:
            time.sleep(remaining)
        self.notify_watcher()
def cleanup_x(self):
    """Try very hard to kill off Xvfb. In addition to killing
    this instance's xvfb, occasionally (randomly) search for
    escaped Xvfb instances and kill those too."""
    if not hasattr(self, 'xvfb'):
        return
    check_call(['xauth', 'remove', self.xserver_no])
    p = self.xvfb
    log("trying to kill Xvfb %s" % p.pid)
    os.kill(p.pid, 15)
    # Give it up to ~2 seconds to die politely before resorting to -9.
    for i in range(10):
        if p.poll() is not None:
            log("%s died with %s" % (p.pid, p.poll()))
            break
        log("%s not dead yet" % p.pid)
        time.sleep(0.2)
    else:
        log("Xvfb would not die! kill -9! kill -9!")
        os.kill(p.pid, 9)

    if random.random() < 0.1:
        # occasionally kill old xvfbs and soffices, if there are any.
        self.kill_old_processes()
959 def kill_old_processes(self):
960 """Sometimes, despite everything, Xvfb or soffice instances
961 hang around well after they are wanted -- for example if the
962 cgi process dies particularly badly. So kill them if they have
963 been running for a long time."""
964 log("running kill_old_processes")
965 p = Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
966 '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
967 data = p.communicate()[0].strip()
968 if data:
969 lines = data.split('\n')
970 pids = []
971 for line in lines:
972 log('dealing with ps output "%s"' % line)
973 try:
974 pid, days, hours, minutes, seconds \
975 = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
976 except AttributeError:
977 log("Couldn't parse that line!")
978 # 50 minutes should be enough xvfb time for anyone
979 if days or hours or int(minutes) > 50:
980 pid = int(pid)
981 log("going to kill pid %s" % pid)
982 os.kill(pid, 15)
983 pids.append(pid)
985 time.sleep(1.0)
986 for pid in pids:
987 #try again in case any are lingerers
988 try:
989 os.kill(int(pid), 9)
990 except OSError, e:
991 log('PID %s seems dead (re-kill gives %s)' % (pid, e))
992 continue
993 log('killing %s with -9' % pid)
994 self.notify_watcher()
996 def cleanup(self):
997 self.cleanup_x()
998 if not config.KEEP_TEMP_FILES:
999 for fn in os.listdir(self.workdir):
1000 os.remove(os.path.join(self.workdir, fn))
1001 os.rmdir(self.workdir)
1002 else:
1003 log("NOT removing '%s', containing the following files:" % self.workdir)
1004 log(*os.listdir(self.workdir))
1006 self.notify_watcher()
def use_cache():
    """True when this host is configured to always use cached booki-zips."""
    host = os.environ.get('HTTP_HOST')
    return host in config.USE_ZIP_CACHE_ALWAYS_HOSTS
def _read_cached_zip(server, book, max_age):
    """Return the contents of a recent cached booki-zip for this book,
    or None if no cached zip newer than max_age minutes exists.

    The newest zip in config.BOOKI_BOOK_DIR whose name matches this
    book is considered; its timestamp is parsed from the filename.
    """
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    log(repr(zipname))
    try:
        date = time.mktime(time.strptime(zipname, prefix + '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            # binary mode: the zip is binary data, and text mode would
            # mangle it on platforms that translate newlines.
            f = open(zipname, 'rb')
            try:
                blob = f.read()
            finally:
                f.close()
            return blob
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError) as e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
def fetch_zip(server, book, project, save=False, max_age=-1):
    """Fetch a booki-zip for the given book, perhaps via the cache.

    :param server: source server name (a config.SERVER_DEFAULTS key)
    :param book: book identifier
    :param project: project name (used in the Booki interface URL)
    :param save: if true, save a copy in config.BOOKI_BOOK_DIR
    :param max_age: use a cached zip younger than this many minutes;
        0 disables the cache, negative means "host default".
    :return: the zip contents as a byte string.
    :raises NotImplementedError: if the server interface is unknown.
    """
    interface = config.SERVER_DEFAULTS[server]['interface']
    if interface not in ('Booki', 'TWiki'):
        raise NotImplementedError("Can't handle '%s' interface" % interface)
    if interface == 'Booki':
        url = config.BOOKI_ZIP_URL % {'server': server, 'project': project, 'book':book}
    else:
        # the twiki gateway cgi runs on this objavi host.  (The
        # original used a bare HTTP_HOST, which is not defined in this
        # module and raised NameError; use_cache() reads the same
        # environment variable.)
        url = config.TWIKI_GATEWAY_URL % (os.environ.get('HTTP_HOST', ''),
                                          server, book)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    if max_age:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob = _read_cached_zip(server, book, max_age)
        if blob is not None:
            return blob

    log('fetching zip from %s' % url)
    f = urlopen(url)
    blob = f.read()
    f.close()
    if save:
        zipname = make_book_name(book, server, '.zip')
        # binary mode: the blob is zip data, not text.
        f = open('%s/%s' % (config.BOOKI_BOOK_DIR, zipname), 'wb')
        f.write(blob)
        f.close()
    return blob
def split_html(html, compressed_size=None, xhtmlise=False):
    """Split an html string into several pieces when it exceeds epub
    size limits, cutting at tag boundaries near evenly spaced points.

    Marker <hr> elements are spliced into the serialised html, the
    result is parsed, and the tree is split along the ancestor lineage
    of each marker; branches off those lineages are moved wholesale.

    :param html: the html to split (a byte string).
    :param compressed_size: zlib-compressed size of the html, if
        already known; computed here otherwise.
    :param xhtmlise: if true, round-trip through lxml first, which
        removes stray '<' characters in attributes and makes the
        marker insertion more reliable.
    :return: a list of html strings (single-element if no split needed).
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    if not splits:
        return [html]

    if xhtmlise:
        #xhtmlisation removes '<' in attributes etc, which makes the
        #marker insertion more reliable
        # (restored the closing parenthesis, which had gone missing)
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              #method='html'
                              )

    # insert a marker <hr> at the first tag boundary after each of the
    # evenly spaced target offsets.
    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS, i))
        s = e
    fragments.append(html[s:])
    root = lxml.html.fromstring(''.join(fragments))

    # find the node lineages along which to split the document.
    # anything outside these lines (i.e., branches) can be copied
    # wholesale.
    stacks = []
    for hr in root.iter(tag='hr'):
        if hr.get('class') == config.MARKER_CLASS:
            stack = [hr]
            stack.extend(x for x in hr.iterancestors())
            stack.reverse()
            stacks.append(stack)

    iterstacks = iter(stacks)

    # walk the tree copying nodes into a fresh document; each time a
    # marker is reached, finish that document and start a new one.
    src = root
    log('root is', root, root.attrib, type(root.attrib))
    dest = lxml.html.Element(root.tag, **dict(root.items()))
    doc = dest
    stack = iterstacks.next()
    marker = stack[-1]

    chapters = []
    try:
        while True:
            for e in src:
                if e not in stack:
                    #cut and paste branch
                    dest.append(e)
                elif e is marker:
                    #got one
                    src.remove(e)
                    chapters.append(doc)
                    src = root
                    dest = lxml.html.Element(root.tag, **dict(root.items()))
                    doc = dest
                    stack = iterstacks.next()
                    marker = stack[-1]
                    break
                else:
                    #next level
                    dest = etree.SubElement(dest, e.tag, **dict(e.items()))
                    dest.text = e.text
                    e.text = None
                    src = e
                    break
    except StopIteration:
        #stacks have run out -- the rest of the tree is the last section
        chapters.append(src)

    #return chapters
    return [etree.tostring(c, encoding='UTF-8', method='html') for c in chapters]