1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
27 from subprocess
import Popen
, check_call
, PIPE
28 from cStringIO
import StringIO
29 from urllib2
import urlopen
32 from string
import ascii_letters
33 from pprint
import pformat
36 import simplejson
as json
40 import lxml
, lxml
.html
41 from lxml
import etree
43 from objavi
import config
, epub_utils
44 from objavi
.cgi_utils
import log
, run
, shift_file
, make_book_name
, guess_lang
, guess_text_dir
45 from objavi
.pdf
import PageSettings
, count_pdf_pages
, concat_pdfs
, rotate_pdf
, parse_outline
46 from objavi
.epub
import add_guts
, _find_tag
48 from iarchive
import epub
as ia_epub
49 from booki
.xhtml_utils
import EpubChapter
50 from booki
.bookizip
import get_metadata
, add_metadata
, clear_metadata
, get_metadata_schemes
# Working directories and publication endpoints. DOC_ROOT/HTTP_HOST come from
# the CGI environment; the fallbacks ('.' and '') let the module load when it
# is not running under a web server.
TMPDIR = os.path.abspath(config.TMPDIR)
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', '.')
HTTP_HOST = os.environ.get('HTTP_HOST', '')
# Finished books are published under the web document root.
PUBLISH_PATH = "%s/books/" % DOC_ROOT
58 def _get_best_title(tocpoint
):
59 if 'html_title' in tocpoint
:
60 return tocpoint
['html_title']
61 if 'title' in tocpoint
:
62 return tocpoint
['title']
66 def _add_initial_number(e
, n
):
67 """Put a styled chapter number n at the beginning of element e."""
68 initial
= e
.makeelement("strong", Class
="initial")
71 if e
.text
is not None:
72 initial
.tail
+= e
.text
74 initial
.text
= "%s." % n
76 def expand_toc(toc
, depth
=1, index
=0):
77 """Reformat toc slightly for convenience"""
79 url
= item
['url'].lstrip('/')
80 bits
= url
.split('#', 1)
82 fragment
= (bits
[1] if len(bits
) == 2 else None)
84 item
["filename"] = filename
85 item
["fragment"] = fragment
88 if 'children' in item
:
89 index
= expand_toc(item
['children'], depth
+ 1, index
)
92 def _serialise(rtoc
, stoc
, depth
):
94 url
= item
['url'].lstrip('/')
95 bits
= url
.split('#', 1)
97 fragment
= (bits
[1] if len(bits
) == 2 else None)
98 stoc
.append({"depth": depth
,
99 "title": item
['title'],
101 "filename": filename
,
102 "fragment": fragment
,
105 if 'children' in item
:
106 _serialise(item
['children'], stoc
, depth
+ 1)
109 def serialise_toc(rtoc
):
110 """Take the recursive TOC structure and turn it into a list of
111 serial points. Reformat some things for convenience."""
113 _serialise(rtoc
, stoc
, 1)
114 for i
, x
in enumerate(stoc
):
118 def filename_toc_map(rtoc
):
124 tocmap
.setdefault(point
['filename'], []).append(point
)
125 if 'children' in point
:
126 traverse(point
['children'])
# Default page-numbering styles: arabic ("latin") numerals for the book
# body, roman numerals for the preamble (title page, table of contents).
page_numbers = 'latin'
preamble_page_numbers = 'roman'
135 def notify_watcher(self
, message
=None):
138 #message is the name of the caller
139 message
= traceback
.extract_stack(None, 2)[0][2]
140 log("notify_watcher called with '%s'" % message
)
141 self
.watcher(message
)
146 def __exit__(self
, exc_type
, exc_value
, traceback
):
148 #could deal with exceptions here and return true
151 def __init__(self
, book
, server
, bookname
, project
=None,
152 page_settings
=None, watcher
=None, isbn
=None,
153 license
=config
.DEFAULT_LICENSE
, title
=None,
155 log("*** Starting new book %s ***" % bookname
,
156 "starting zipbook with", server
, book
, project
)
157 self
.watcher
= watcher
158 self
.notify_watcher('start')
159 self
.bookname
= bookname
162 self
.project
= project
163 self
.cookie
= ''.join(random
.sample(ascii_letters
, 10))
165 blob
= fetch_zip(server
, book
, project
, save
=True, max_age
=max_age
)
168 traceback
.print_exc()
169 self
.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e
.url
, e
.code
, e
.msg
))
173 self
.notify_watcher('fetch_zip')
174 self
.store
= zipfile
.ZipFile(f
, 'r')
175 self
.info
= json
.loads(self
.store
.read('info.json'))
176 for k
in ('manifest', 'metadata', 'spine', 'TOC'):
177 if k
not in self
.info
:
178 raise ObjaviError('info.json of %s lacks vital element "%s"' %
182 self
.metadata
= self
.info
['metadata']
183 self
.spine
= self
.info
['spine']
184 self
.manifest
= self
.info
['manifest']
186 if server
== config
.LOCALHOST
: # [DEPRECATED]
187 server
= get_metadata(self
.metadata
, 'server', ns
=config
.FM
, default
=[server
])[0]
188 book
= get_metadata(self
.metadata
, 'book', ns
=config
.FM
, default
=[book
])[0]
190 log(pformat(self
.metadata
))
191 self
.lang
= get_metadata(self
.metadata
, 'language', default
=[None])[0]
193 self
.lang
= guess_lang(server
, book
)
194 log('guessed lang as %s' % self
.lang
)
196 self
.toc_header
= get_metadata(self
.metadata
, 'toc_header', ns
=config
.FM
, default
=[None])[0]
197 if not self
.toc_header
:
198 self
.toc_header
= config
.SERVER_DEFAULTS
[server
]['toc_header']
200 self
.dir = get_metadata(self
.metadata
, 'dir', ns
=config
.FM
, default
=[None])[0]
202 self
.dir = guess_text_dir(server
, book
)
205 #Patch in the extra metadata. (lang and dir may be set from config)
206 #these should be read from zip -- so should go into zip?
207 for var
, key
, scheme
, ns
in (
208 (isbn
, 'id', 'ISBN', config
.DC
),
209 (license
, 'rights', 'License', config
.DC
),
210 (title
, 'title', '', config
.DC
),
211 (self
.lang
, 'language', '', config
.DC
),
212 (self
.dir, 'dir', '', config
.FM
),
215 add_metadata(self
.metadata
, key
, var
, scheme
=scheme
, ns
=ns
)
217 self
.isbn
= get_metadata(self
.metadata
, 'id', scheme
='ISBN', default
=[None])[0]
218 self
.license
= get_metadata(self
.metadata
, 'rights', scheme
='License', default
=[None])[0]
220 self
.toc
= self
.info
['TOC']
223 self
.workdir
= tempfile
.mkdtemp(prefix
=bookname
, dir=TMPDIR
)
224 os
.chmod(self
.workdir
, 0755)
226 self
.body_html_file
= self
.filepath('body.html')
227 self
.body_pdf_file
= self
.filepath('body.pdf')
228 self
.preamble_html_file
= self
.filepath('preamble.html')
229 self
.preamble_pdf_file
= self
.filepath('preamble.pdf')
230 self
.tail_html_file
= self
.filepath('tail.html')
231 self
.tail_pdf_file
= self
.filepath('tail.pdf')
232 self
.isbn_pdf_file
= None
233 self
.pdf_file
= self
.filepath('final.pdf')
234 self
.body_odt_file
= self
.filepath('body.odt')
236 self
.publish_file
= os
.path
.join(PUBLISH_PATH
, bookname
)
237 self
.publish_url
= os
.path
.join(config
.PUBLISH_URL
, bookname
)
239 if page_settings
is not None:
240 self
.maker
= PageSettings(**page_settings
)
242 titles
= get_metadata(self
.metadata
, 'title')
244 self
.title
= titles
[0]
246 self
.title
= 'A Manual About ' + self
.book
248 self
.notify_watcher()
251 if config
.TRY_BOOK_CLEANUP_ON_DEL
:
252 #Dont even define __del__ if it is not used.
253 _try_cleanup_on_del
= True
255 if self
._try
_cleanup
_on
_del
and os
.path
.exists(self
.workdir
):
256 self
._try
_cleanup
_on
_del
= False #or else you can get in bad cycles
259 def get_tree_by_id(self
, id):
260 """get an HTML tree from the given manifest ID"""
261 name
= self
.manifest
[id]['url']
262 mimetype
= self
.manifest
[id]['mimetype']
263 s
= self
.store
.read(name
)
265 if mimetype
== 'text/html':
267 tree
= lxml
.html
.parse(f
)
268 except etree
.XMLSyntaxError
, e
:
269 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
270 (id, name
, s
[:20], e
))
271 tree
= lxml
.html
.document_fromstring('<html><body></body></html>').getroottree()
272 elif 'xml' in mimetype
: #XXX or is this just asking for trouble?
273 tree
= etree
.parse(f
)
def filepath(self, fn):
    """Resolve *fn* relative to this book's temporary working directory."""
    base = self.workdir
    return os.path.join(base, fn)
282 def save_data(self
, fn
, data
):
283 """Save without tripping up on unicode"""
284 if isinstance(data
, unicode):
285 data
= data
.encode('utf8', 'ignore')
290 def save_tempfile(self
, fn
, data
):
291 """Save the data in a temporary directory that will be cleaned
292 up when all is done. Return the absolute file path."""
293 fn
= self
.filepath(fn
)
294 self
.save_data(fn
, data
)
297 def make_oo_doc(self
):
298 """Make an openoffice document, using the html2odt script."""
300 html_text
= etree
.tostring(self
.tree
, method
="html")
301 self
.save_data(self
.body_html_file
, html_text
)
302 run([config
.HTML2ODT
, self
.workdir
, self
.body_html_file
, self
.body_odt_file
])
303 log("Publishing %r as %r" % (self
.body_odt_file
, self
.publish_file
))
304 os
.rename(self
.body_odt_file
, self
.publish_file
)
305 self
.notify_watcher()
307 def extract_pdf_outline(self
):
308 #self.outline_contents, self.outline_text, number_of_pages = parse_outline(self.body_pdf_file, 1)
309 debugf
= self
.filepath('outline.txt')
310 self
.outline_contents
, self
.outline_text
, number_of_pages
= \
311 parse_outline(self
.body_pdf_file
, 1, debugf
)
313 if not self
.outline_contents
:
314 #probably problems with international text. need a horrible hack
315 log('no outline: trying again with ascii headings')
317 tree
= copy
.deepcopy(self
.tree
)
319 for tag
in ('h1', 'h2', 'h3', 'h4'):
320 for i
, e
in enumerate(tree
.getiterator(tag
)):
321 key
= "%s_%s" % (tag
, i
)
322 titlemap
[key
] = e
.text_content().strip(config
.WHITESPACE_AND_NULL
)
325 e
= lxml
.etree
.SubElement(e
, "strong", Class
="initial")
327 log("key: %r, text: %r, value: %r" %(key
, e
.text
, titlemap
[key
]))
329 ascii_html_file
= self
.filepath('body-ascii-headings.html')
330 ascii_pdf_file
= self
.filepath('body-ascii-headings.pdf')
331 html_text
= lxml
.etree
.tostring(tree
, method
="html")
332 self
.save_data(ascii_html_file
, html_text
)
333 self
.maker
.make_raw_pdf(ascii_html_file
, ascii_pdf_file
, outline
=True)
334 debugf
= self
.filepath('ascii_outline.txt')
335 ascii_contents
, ascii_text
, number_of_ascii_pages
= \
336 parse_outline(ascii_pdf_file
, 1, debugf
)
337 self
.outline_contents
= []
338 log ("number of pages: %s, post ascii: %s" %
339 (number_of_pages
, number_of_ascii_pages
))
340 for ascii_title
, depth
, pageno
in ascii_contents
:
341 if ascii_title
[-4:] == '�': #stupid [something] puts this in
342 ascii_title
= ascii_title
[:-4]
343 if ' ' in ascii_title
:
344 ascii_title
= ascii_title
.rsplit(' ', 1)[1]
345 title
= titlemap
.get(ascii_title
, '')
346 log((ascii_title
, title
, depth
, pageno
))
348 self
.outline_contents
.append((title
, depth
, pageno
))
350 for x
in self
.outline_contents
:
353 self
.notify_watcher()
354 return number_of_pages
356 def make_body_pdf(self
):
357 """Make a pdf of the HTML, using webkit"""
359 html_text
= etree
.tostring(self
.tree
, method
="html")
360 self
.save_data(self
.body_html_file
, html_text
)
363 self
.maker
.make_raw_pdf(self
.body_html_file
, self
.body_pdf_file
, outline
=True)
364 self
.notify_watcher('generate_pdf')
366 n_pages
= self
.extract_pdf_outline()
368 log ("found %s pages in pdf" % n_pages
)
369 #4. resize pages, shift gutters, even pages
370 self
.maker
.reshape_pdf(self
.body_pdf_file
, self
.dir, centre_end
=True)
371 self
.notify_watcher('reshape_pdf')
374 self
.maker
.number_pdf(self
.body_pdf_file
, n_pages
, dir=self
.dir,
375 numbers
=self
.page_numbers
)
376 self
.notify_watcher("number_pdf")
377 self
.notify_watcher()
379 def make_preamble_pdf(self
):
380 contents
= self
.make_contents()
381 inside_cover_html
= self
.compose_inside_cover()
382 log(self
.dir, self
.css_url
, self
.title
, inside_cover_html
,
383 self
.toc_header
, contents
, self
.title
)
385 html
= ('<html dir="%s"><head>\n'
386 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
387 '<link rel="stylesheet" href="%s" />\n'
389 '<h1 class="frontpage">%s</h1>'
391 '<div class="contents"><h1>%s</h1>\n%s</div>\n'
392 '<div style="page-break-after: always; color:#fff" class="unseen">.'
393 '<!--%s--></div></body></html>'
394 ) % (self
.dir, self
.css_url
, self
.title
, inside_cover_html
.decode('utf-8'),
395 self
.toc_header
, contents
, self
.title
)
396 self
.save_data(self
.preamble_html_file
, html
)
398 self
.maker
.make_raw_pdf(self
.preamble_html_file
, self
.preamble_pdf_file
)
400 self
.maker
.reshape_pdf(self
.preamble_pdf_file
, self
.dir, centre_start
=True)
402 self
.maker
.number_pdf(self
.preamble_pdf_file
, None, dir=self
.dir,
403 numbers
=self
.preamble_page_numbers
,
406 self
.notify_watcher()
408 def make_end_matter_pdf(self
):
409 """Make an inside back cover and a back cover. If there is an
410 isbn number its barcode will be put on the back cover."""
412 self
.isbn_pdf_file
= self
.filepath('isbn.pdf')
413 self
.maker
.make_barcode_pdf(self
.isbn
, self
.isbn_pdf_file
)
414 self
.notify_watcher('make_barcode_pdf')
416 end_matter
= self
.compose_end_matter()
418 self
.save_data(self
.tail_html_file
, end_matter
.decode('utf-8'))
419 self
.maker
.make_raw_pdf(self
.tail_html_file
, self
.tail_pdf_file
)
421 self
.maker
.reshape_pdf(self
.tail_pdf_file
, self
.dir, centre_start
=True,
422 centre_end
=True, even_pages
=False)
423 self
.notify_watcher()
425 def make_book_pdf(self
):
426 """A convenient wrapper of a few necessary steps"""
427 # now the Xvfb server is needed. make sure it has had long enough to get going
430 self
.make_preamble_pdf()
431 self
.make_end_matter_pdf()
433 concat_pdfs(self
.pdf_file
, self
.preamble_pdf_file
,
434 self
.body_pdf_file
, self
.tail_pdf_file
,
437 self
.notify_watcher('concatenated_pdfs')
440 def make_simple_pdf(self
, mode
):
441 """Make a simple pdf document without contents or separate
442 title page. This is used for multicolumn newspapers and for
443 web-destined pdfs."""
445 #0. Add heading to begining of html
446 body
= list(self
.tree
.cssselect('body'))[0]
447 e
= body
.makeelement('h1', {'id': 'book-title'})
450 intro
= lxml
.html
.fragment_fromstring(self
.compose_inside_cover())
453 #0.5 adjust parameters to suit the particular kind of output
455 self
.maker
.gutter
= 0
458 html_text
= etree
.tostring(self
.tree
, method
="html")
459 self
.save_data(self
.body_html_file
, html_text
)
461 #2. Make a pdf of it (direct to to final pdf)
462 self
.maker
.make_raw_pdf(self
.body_html_file
, self
.pdf_file
, outline
=True)
463 self
.notify_watcher('generate_pdf')
464 n_pages
= count_pdf_pages(self
.pdf_file
)
467 #3. resize pages and shift gutters.
468 self
.maker
.reshape_pdf(self
.pdf_file
, self
.dir, centre_end
=True)
469 self
.notify_watcher('reshape_pdf')
472 self
.maker
.number_pdf(self
.pdf_file
, n_pages
,
473 dir=self
.dir, numbers
=self
.page_numbers
)
474 self
.notify_watcher("number_pdf")
475 self
.notify_watcher()
479 """Rotate the pdf 180 degrees so an RTL book can print on LTR
481 rotated
= self
.filepath('final-rotate.pdf')
482 unrotated
= self
.filepath('final-pre-rotate.pdf')
483 #leave the unrotated pdf intact at first, in case of error.
484 rotate_pdf(self
.pdf_file
, rotated
)
485 os
.rename(self
.pdf_file
, unrotated
)
486 os
.rename(rotated
, self
.pdf_file
)
487 self
.notify_watcher()
def publish_pdf(self):
    """Move the finished PDF to its final resting place"""
    source = self.pdf_file
    destination = self.publish_file
    log("Publishing %r as %r" % (source, destination))
    os.rename(source, destination)
    self.notify_watcher()
496 def concat_html(self
):
497 """Join all the chapters together into one tree. Keep the TOC
498 up-to-date along the way."""
500 #each manifest item looks like:
506 doc
= lxml
.html
.document_fromstring('<html><body></body></html>')
507 tocmap
= filename_toc_map(self
.toc
)
508 for ID
in self
.spine
:
509 details
= self
.manifest
[ID
]
510 log(ID
, pformat(details
))
511 root
= self
.get_tree_by_id(ID
).getroot()
512 #handle any TOC points in this file
513 for point
in tocmap
[details
['url']]:
514 #if the url has a #identifier, use it. Otherwise, make
515 #one up, using a hidden element at the beginning of
516 #the inserted document.
517 #XXX this will break if different files use the same ids
518 #XXX should either replace all, or replace selectively.
519 if point
['fragment']:
520 fragment
= point
['fragment']
522 body
= _find_tag(root
, 'body')
523 fragment
= '%s_%s' % (self
.cookie
, point
['index'])
524 #reuse first tag if it is suitable.
526 body
[0].tag
in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
527 if body
[0].get('id') is None:
528 body
[0].set('id', fragment
)
530 fragment
= body
[0].get('id')
531 #the chapter starts with a heading. that heading should be the chapter name.
532 if body
[0].tag
in ('h1', 'h2', 'h3'):
533 log('chapter has title "%s", found html title "%s"' %
534 (point
['title'], body
[0].text_content()))
535 point
['html_title'] = body
[0].text_content()
537 marker
= body
.makeelement('div', style
="display:none",
539 body
.insert(0, marker
)
540 point
['html_id'] = fragment
545 def unpack_static(self
):
546 """Extract static files from the zip for the html to refer to."""
547 static_files
= [x
['url'] for x
in self
.manifest
.values()
548 if x
['url'].startswith('static')]
550 os
.mkdir(self
.filepath('static'))
552 for name
in static_files
:
553 s
= self
.store
.read(name
)
554 f
= open(self
.filepath(name
), 'w')
557 self
.notify_watcher()
561 #XXX concatenate the HTML to match how TWiki version worked.
562 # This is perhaps foolishly early -- throwing away useful boundaries.
564 self
.tree
= self
.concat_html()
565 self
.save_tempfile('raw.html', etree
.tostring(self
.tree
, method
='html'))
567 self
.headings
= [x
for x
in self
.tree
.cssselect('h1')]
569 self
.headings
[0].set('class', "first-heading")
570 for h1
in self
.headings
:
571 h1
.title
= h1
.text_content().strip()
572 self
.notify_watcher()
574 def make_contents(self
):
575 """Generate HTML containing the table of contents. This can
576 only be done after the main PDF has been made, because the
577 page numbers are contained in the PDF outline."""
578 header
= '<h1>Table of Contents</h1><table class="toc">\n'
579 row_tmpl
= ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
580 '<td class="pagenumber">%s</td></tr>\n')
581 empty_section_tmpl
= ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
582 section_tmpl
= ('<tr><td class="section" colspan="3">%s</td></tr>\n')
583 footer
= '\n</table>'
589 subsections
= [] # for the subsection heading pages.
591 outline_contents
= iter(self
.outline_contents
)
592 headings
= iter(self
.headings
)
594 for section
in self
.toc
:
595 if not section
.get('children'):
596 contents
.append(empty_section_tmpl
% section
['title'])
598 contents
.append(section_tmpl
% section
['title'])
600 for point
in section
['children']:
602 h1_text
, level
, page_num
= outline_contents
.next()
603 except StopIteration:
604 log("contents data not found for %s. Stopping" % (point
,))
606 contents
.append(row_tmpl
% (chapter
, _get_best_title(point
), page_num
))
609 doc
= header
+ '\n'.join(contents
) + footer
610 self
.notify_watcher()
613 def add_section_titles(self
):
614 """Add any section heading pages that the TOC.txt file
615 specifies. These are sub-book, super-chapter groupings.
617 Also add initial numbers to chapters.
619 headings
= iter(self
.headings
)
624 #only top level sections get a subsection page,
625 #and only if they have children.
626 if t
.get('children'):
627 section
= self
.tree
.makeelement('div', Class
="objavi-subsection")
628 heading
= etree
.SubElement(section
, 'div', Class
="objavi-subsection-heading")
629 heading
.text
= t
['title']
630 for child
in t
['children']:
631 item
= etree
.SubElement(section
, 'div', Class
="objavi-chapter")
632 if 'html_title' in child
:
633 item
.text
= child
['html_title']
634 heading
= self
.tree
.cssselect('#'+ child
['html_id'])
636 _add_initial_number(heading
[0], chapter
)
638 item
.text
= child
['title']
639 _add_initial_number(item
, chapter
)
640 log(item
.text
, debug
='HTMLGEN')
642 location
= self
.tree
.cssselect('#'+ t
['html_id'])[0]
643 location
.addprevious(section
)
646 self
.notify_watcher()
649 def add_css(self
, css
=None, mode
='book'):
650 """If css looks like a url, use it as a stylesheet link.
651 Otherwise it is the CSS itself, which is saved to a temporary file
653 log("css is %r" % css
)
655 if css
is None or not css
.strip():
656 defaults
= config
.SERVER_DEFAULTS
[self
.server
]
657 url
= 'file://' + os
.path
.abspath(defaults
['css-%s' % mode
])
658 elif not re
.match(r
'^http://\S+$', css
):
659 fn
= self
.save_tempfile('objavi.css', css
)
663 #XXX for debugging and perhaps sensible anyway
664 #url = url.replace('file:///home/douglas/objavi2', '')
667 #find the head -- it's probably first child but lets not assume.
668 for child
in htmltree
:
669 if child
.tag
== 'head':
673 head
= htmltree
.makeelement('head')
674 htmltree
.insert(0, head
)
676 link
= etree
.SubElement(head
, 'link', rel
='stylesheet', type='text/css', href
=url
)
678 self
.notify_watcher()
682 def _read_localised_template(self
, template
, fallbacks
=['en']):
683 """Try to get the template in the approriate language, otherwise in english."""
684 for lang
in [self
.lang
] + fallbacks
:
686 fn
= template
% (lang
)
690 log("couldn't open inside front cover for lang %s (filename %s)" % (lang
, fn
))
696 def compose_inside_cover(self
):
697 """create the markup for the preamble inside cover."""
698 template
= self
._read
_localised
_template
(config
.INSIDE_FRONT_COVER_TEMPLATE
)
701 isbn_text
= '<b>ISBN :</b> %s <br>' % self
.isbn
705 return template
% {'date': time
.strftime('%Y-%m-%d'),
707 'license': self
.license
,
711 def compose_end_matter(self
):
712 """create the markup for the end_matter inside cover. If
713 self.isbn is not set, the html will result in a pdf that
714 spills onto two pages.
716 template
= self
._read
_localised
_template
(config
.END_MATTER_TEMPLATE
)
718 d
= {'css_url': self
.css_url
,
723 d
['inside_cover_style'] = ''
725 d
['inside_cover_style'] = 'page-break-after: always'
730 def make_epub(self
, use_cache
=False):
731 """Make an epub version of the book, using Mike McCabe's
732 epub module for the Internet Archive."""
733 ebook
= ia_epub
.Book(self
.publish_file
, content_dir
='')
734 toc
= self
.info
['TOC']
737 filemap
= {} #map html to corresponding xhtml
738 for ID
in self
.manifest
:
739 details
= self
.manifest
[ID
]
740 log(ID
, pformat(details
))
741 fn
, mediatype
= details
['url'], details
['mimetype']
743 content
= self
.store
.read(fn
)
744 if mediatype
== 'text/html':
746 #convert to application/xhtml+xml
747 c
= EpubChapter(self
.server
, self
.book
, ID
, content
,
751 content
= c
.as_xhtml()
752 if fn
[-5:] == '.html':
755 mediatype
= 'application/xhtml+xml'
758 info
= {'id': ID
.encode('utf-8'),
759 'href': fn
.encode('utf-8'),
760 'media-type': mediatype
.encode('utf-8')}
761 ebook
.add_content(info
, content
)
764 ncx
= epub_utils
.make_ncx(toc
, self
.metadata
, filemap
)
765 ebook
.add(ebook
.content_dir
+ 'toc.ncx', ncx
)
768 for ID
in self
.spine
:
769 ebook
.add_spine_item({'idref': ID
})
771 #metadata -- no use of attributes (yet)
772 # and fm: metadata disappears for now
776 for ns
, namespace
in self
.metadata
.items():
777 for keyword
, schemes
in namespace
.items():
779 keyword
= '{%s}%s' % (ns
, keyword
)
780 for scheme
, values
in schemes
.items():
787 if keyword
in (DCNS
+ 'creator', DCNS
+ 'contributor'):
788 item
['atts'] = {'role': scheme
}
790 item
['atts'] = {'scheme': scheme
}
792 has_authors
= 'creator' in self
.metadata
[DC
]
793 if not has_authors
and config
.CLAIM_UNAUTHORED
:
794 meta_info_items
.append({'item': DCNS
+ 'creator',
795 'text': 'The Contributors'})
797 meta_info_items
.append({'item': DCNS
+ 'rights',
798 'text': 'This book is free. Copyright %s' % (', '.join(authors
))}
801 tree_str
= ia_epub
.make_opf(meta_info_items
,
802 ebook
.manifest_items
,
806 ebook
.add(ebook
.content_dir
+ 'content.opf', tree_str
)
810 def publish_s3(self
):
811 """Push the book's epub to archive.org, using S3."""
814 for x
in ('S3_SECRET', 'S3_ACCESSKEY'):
815 fn
= getattr(config
, x
)
817 secrets
[x
] = f
.read().strip()
821 now
= time
.strftime('%F')
822 s3output
= self
.filepath('s3-output.txt')
823 s3url
= 'http://s3.us.archive.org/booki-%s-%s/%s' % (self
.project
, self
.book
, self
.bookname
)
824 detailsurl
= 'http://archive.org/details/booki-%s-%s' % (self
.project
, self
.book
)
826 'x-amz-auto-make-bucket:1',
827 "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets
,
828 'x-archive-meta-mediatype:texts',
829 'x-archive-meta-collection:opensource',
830 'x-archive-meta-title:%s' %(self
.book
,),
831 'x-archive-meta-date:%s' % (now
,),
832 'x-archive-meta-creator:FLOSS Manuals Contributors',
835 if self
.license
in config
.LICENSES
:
836 headers
.append('x-archive-meta-licenseurl:%s' % config
.LICENSES
[self
.license
])
838 argv
= ['curl', '--location', '-s', '-o', s3output
]
840 argv
.extend(('--header', h
))
841 argv
.extend(('--upload-file', self
.publish_file
, s3url
,))
843 log(' '.join(repr(x
) for x
in argv
))
844 check_call(argv
, stdout
=sys
.stderr
)
845 return detailsurl
, s3url
849 """Start an Xvfb instance, using a new server number. A
850 reference to it is stored in self.xvfb, which is used to kill
851 it when the pdf is done.
853 Note that Xvfb doesn't interact well with dbus which is
854 present on modern desktops.
856 #Find an unused server number (in case two cgis are running at once)
858 servernum
= random
.randrange(50, 500)
859 if not os
.path
.exists('/tmp/.X%s-lock' % servernum
):
862 self
.xserver_no
= ':%s' % servernum
864 authfile
= self
.filepath('Xauthority')
865 os
.environ
['XAUTHORITY'] = authfile
867 #mcookie(1) eats into /dev/random, so avoid that
868 from hashlib
import md5
869 m
= md5("%r %r %r %r %r" % (self
, os
.environ
, os
.getpid(), time
.time(), os
.urandom(32)))
870 mcookie
= m
.hexdigest()
872 check_call(['xauth', 'add', self
.xserver_no
, '.', mcookie
])
874 self
.xvfb
= Popen(['Xvfb', self
.xserver_no
,
875 '-screen', '0', '1024x768x24',
878 #'-whitepixel', str(2 ** 24 -1),
879 #'+extension', 'Composite',
885 # We need to wait a bit before the Xvfb is ready. but the
886 # downloads are so slow that that probably doesn't matter
888 self
.xvfb_ready_time
= time
.time() + 2
890 os
.environ
['DISPLAY'] = self
.xserver_no
893 def wait_for_xvfb(self
):
894 """wait until a previously set time before continuing. This
895 is so Xvfb has time to properly start."""
896 if hasattr(self
, 'xvfb'):
897 d
= self
.xvfb_ready_time
- time
.time()
900 self
.notify_watcher()
903 """Try very hard to kill off Xvfb. In addition to killing
904 this instance's xvfb, occasionally (randomly) search for
905 escaped Xvfb instances and kill those too."""
906 if not hasattr(self
, 'xvfb'):
908 check_call(['xauth', 'remove', self
.xserver_no
])
910 log("trying to kill Xvfb %s" % p
.pid
)
913 if p
.poll() is not None:
914 log("%s died with %s" % (p
.pid
, p
.poll()))
916 log("%s not dead yet" % p
.pid
)
919 log("Xvfb would not die! kill -9! kill -9!")
922 if random
.random() < 0.1:
923 # occasionally kill old xvfbs and soffices, if there are any.
924 self
.kill_old_processes()
926 def kill_old_processes(self
):
927 """Sometimes, despite everything, Xvfb or soffice instances
928 hang around well after they are wanted -- for example if the
929 cgi process dies particularly badly. So kill them if they have
930 been running for a long time."""
931 log("running kill_old_processes")
932 p
= Popen(['ps', '-C' 'Xvfb soffice soffice.bin html2odt ooffice wkhtmltopdf',
933 '-o', 'pid,etime', '--no-headers'], stdout
=PIPE
)
934 data
= p
.communicate()[0].strip()
936 lines
= data
.split('\n')
939 log('dealing with ps output "%s"' % line
)
941 pid
, days
, hours
, minutes
, seconds \
942 = re
.match(r
'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line
).groups()
943 except AttributeError:
944 log("Couldn't parse that line!")
945 # 50 minutes should be enough xvfb time for anyone
946 if days
or hours
or int(minutes
) > 50:
948 log("going to kill pid %s" % pid
)
954 #try again in case any are lingerers
958 log('PID %s seems dead (re-kill gives %s)' % (pid
, e
))
960 log('killing %s with -9' % pid
)
961 self
.notify_watcher()
965 if not config
.KEEP_TEMP_FILES
:
966 for fn
in os
.listdir(self
.workdir
):
967 os
.remove(os
.path
.join(self
.workdir
, fn
))
968 os
.rmdir(self
.workdir
)
970 log("NOT removing '%s', containing the following files:" % self
.workdir
)
971 log(*os
.listdir(self
.workdir
))
973 self
.notify_watcher()
977 return (os
.environ
.get('HTTP_HOST') in config
.USE_ZIP_CACHE_ALWAYS_HOSTS
)
979 def _read_cached_zip(server
, book
, max_age
):
980 #find a recent zip if possible
981 prefix
= '%s/%s' % (config
.BOOKI_BOOK_DIR
, make_book_name(book
, server
, '').split('-20', 1)[0])
982 from glob
import glob
983 zips
= sorted(glob(prefix
+ '*.zip'))
985 log("no cached booki-zips matching %s*.zip" % (prefix
,))
988 cutoff
= time
.time() - max_age
* 60
991 date
= time
.mktime(time
.strptime(zipname
, prefix
+ '-%Y.%m.%d-%H.%M.%S.zip'))
997 log("%s is too old, must reload" % zipname
)
999 except (IOError, IndexError, ValueError), e
:
1000 log('could not make sense of %s: got exception %s' % (zipname
, e
))
1005 def fetch_zip(server
, book
, project
, save
=False, max_age
=-1):
1006 interface
= config
.SERVER_DEFAULTS
[server
]['interface']
1007 if interface
not in ('Booki', 'TWiki'):
1008 raise NotImplementedError("Can't handle '%s' interface" % interface
)
1009 if interface
== 'Booki':
1010 url
= config
.BOOKI_ZIP_URL
% {'server': server
, 'project': project
, 'book':book
}
1012 url
= config
.TWIKI_GATEWAY_URL
% (HTTP_HOST
, server
, book
)
1014 if use_cache() and max_age
< 0:
1015 #default to 12 hours cache on objavi.halo.gen.nz
1019 log('WARNING: trying to use cached booki-zip',
1020 'If you are debugging booki-zip creation, you will go CRAZY'
1021 ' unless you switch this off')
1022 blob
= _read_cached_zip(server
, book
, max_age
)
1023 if blob
is not None:
1026 log('fetching zip from %s'% url
)
1031 zipname
= make_book_name(book
, server
, '.zip')
1032 f
= open('%s/%s' % (config
.BOOKI_BOOK_DIR
, zipname
), 'w')