1 # Part of Objavi2, which turns html manuals into books.
2 # This contains classes representing books and coordinates their processing.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Library module representing a complete FM book being turned into a
28 from subprocess
import Popen
, check_call
, PIPE
29 from cStringIO
import StringIO
30 from urllib2
import urlopen
, HTTPError
33 from string
import ascii_letters
34 from pprint
import pformat
39 import simplejson
as json
42 from lxml
import etree
44 from objavi
import config
, epub_utils
45 from objavi
.book_utils
import log
, run
, make_book_name
, guess_lang
, guess_text_dir
46 from objavi
.book_utils
import ObjaviError
, log_types
47 from objavi
.pdf
import PageSettings
, count_pdf_pages
, concat_pdfs
, rotate_pdf
, parse_outline
, parse_extracted_outline
48 from objavi
.epub
import add_guts
, _find_tag
49 from objavi
.xhtml_utils
import EpubChapter
, split_tree
50 from objavi
.cgi_utils
import url2path
, path2url
52 from iarchive
import epub
as ia_epub
53 from booki
.bookizip
import get_metadata
, add_metadata
# Absolute form of the temporary directory named in the objavi config;
# per-book working directories are created beneath it.
TMPDIR = os.path.abspath(config.TMPDIR)
# Document root taken from the DOCUMENT_ROOT environment variable, falling
# back to config.HTDOCS when that variable is not set.
DOC_ROOT = os.environ.get('DOCUMENT_ROOT', config.HTDOCS)
# Value of the HTTP_HOST environment variable, or '' when it is not set.
HTTP_HOST = os.environ.get('HTTP_HOST', '')
def find_archive_urls(bookid, bookname):
    """Return the archive.org URLs for a book as (s3url, detailsurl).

    s3url has the form http://s3.us.archive.org/booki-<bookid>/<bookname>
    and detailsurl has the form http://archive.org/details/booki-<bookid>.
    Both arguments are interpolated with %s, so no URL quoting is applied.
    """
    s3url = 'http://s3.us.archive.org/booki-%s/%s' % (bookid, bookname)
    detailsurl = 'http://archive.org/details/booki-%s' % (bookid,)
    return (s3url, detailsurl)
64 def _get_best_title(tocpoint
):
65 if 'html_title' in tocpoint
:
66 return tocpoint
['html_title']
67 if 'title' in tocpoint
:
68 return tocpoint
['title']
72 def _add_initial_number(e
, n
):
73 """Put a styled chapter number n at the beginning of element e."""
74 initial
= e
.makeelement("strong", Class
="initial")
77 if e
.text
is not None:
78 initial
.tail
+= e
.text
80 initial
.text
= "%s." % n
82 def expand_toc(toc
, depth
=1, index
=0):
83 """Reformat toc slightly for convenience"""
85 url
= item
.get('url', '').lstrip('/')
86 bits
= url
.split('#', 1)
88 fragment
= (bits
[1] if len(bits
) == 2 else None)
90 item
["filename"] = filename
91 item
["fragment"] = fragment
94 if 'children' in item
:
95 index
= expand_toc(item
['children'], depth
+ 1, index
)
98 def _serialise(rtoc
, stoc
, depth
):
100 url
= item
['url'].lstrip('/')
101 bits
= url
.split('#', 1)
103 fragment
= (bits
[1] if len(bits
) == 2 else None)
104 stoc
.append({"depth": depth
,
105 "title": item
['title'],
107 "filename": filename
,
108 "fragment": fragment
,
111 if 'children' in item
:
112 _serialise(item
['children'], stoc
, depth
+ 1)
115 def serialise_toc(rtoc
):
116 """Take the recursive TOC structure and turn it into a list of
117 serial points. Reformat some things for convenience."""
119 _serialise(rtoc
, stoc
, 1)
120 for i
, x
in enumerate(stoc
):
124 def filename_toc_map(rtoc
):
130 tocmap
.setdefault(point
['filename'], []).append(point
)
131 if 'children' in point
:
132 traverse(point
['children'])
136 def save_data(fn
, data
):
137 """Save without tripping up on unicode"""
138 if isinstance(data
, unicode):
139 data
= data
.encode('utf8', 'ignore')
146 page_numbers
= 'latin'
147 preamble_page_numbers
= 'roman'
149 def notify_watcher(self
, message
=None):
152 #message is the name of the caller
153 message
= traceback
.extract_stack(None, 2)[0][2]
154 log("notify_watcher called with '%s'" % message
)
155 for w
in self
.watchers
:
161 def __exit__(self
, exc_type
, exc_value
, tb
):
162 self
.notify_watcher(config
.FINISHED_MESSAGE
)
164 #could deal with exceptions here and return true
167 def __init__(self
, book
, server
, bookname
,
168 page_settings
=None, watchers
=None, isbn
=None,
169 license
=config
.DEFAULT_LICENSE
, title
=None,
171 log("*** Starting new book %s ***" % bookname
)
172 self
.watchers
= set()
173 if watchers
is not None:
174 self
.watchers
.update(watchers
)
175 self
.notify_watcher('start')
176 self
.bookname
= bookname
179 self
.cookie
= ''.join(random
.sample(ascii_letters
, 10))
181 blob
, self
.bookizip_file
= fetch_zip(server
, book
, save
=True, max_age
=max_age
)
183 traceback
.print_exc()
184 self
.notify_watcher("ERROR:\n Couldn't get %r\n %s %s" % (e
.url
, e
.code
, e
.msg
))
186 #raise 502 Bad Gateway ?
189 self
.notify_watcher('fetch_zip')
190 self
.store
= zipfile
.ZipFile(f
, 'r')
191 self
.info
= json
.loads(self
.store
.read('info.json'))
192 for k
in ('manifest', 'metadata', 'spine', 'TOC'):
193 if k
not in self
.info
:
194 raise ObjaviError('info.json of %s lacks vital element "%s"' %
198 self
.metadata
= self
.info
['metadata']
199 self
.spine
= self
.info
['spine']
200 self
.manifest
= self
.info
['manifest']
202 if server
== config
.LOCALHOST
: # [DEPRECATED]
203 server
= get_metadata(self
.metadata
, 'server', ns
=config
.FM
, default
=[server
])[0]
204 book
= get_metadata(self
.metadata
, 'book', ns
=config
.FM
, default
=[book
])[0]
206 log(pformat(self
.metadata
))
207 self
.lang
= get_metadata(self
.metadata
, 'language', default
=[None])[0]
209 self
.lang
= guess_lang(server
, book
)
210 log('guessed lang as %s' % self
.lang
)
212 self
.toc_header
= get_metadata(self
.metadata
, 'toc_header', ns
=config
.FM
, default
=[None])[0]
213 if not self
.toc_header
:
214 self
.toc_header
= config
.SERVER_DEFAULTS
[server
]['toc_header']
216 self
.dir = str(get_metadata(self
.metadata
, 'dir', ns
=config
.FM
, default
=[None])[0])
218 self
.dir = guess_text_dir(server
, book
)
220 #Patch in the extra metadata. (lang and dir may be set from config)
221 #these should be read from zip -- so should go into zip?
222 for var
, key
, scheme
, ns
in (
223 (isbn
, 'id', 'ISBN', config
.DC
),
224 (license
, 'rights', 'License', config
.DC
),
225 (title
, 'title', '', config
.DC
),
226 (self
.lang
, 'language', '', config
.DC
),
227 (self
.dir, 'dir', '', config
.FM
),
230 add_metadata(self
.metadata
, key
, var
, scheme
=scheme
, ns
=ns
)
232 self
.isbn
= get_metadata(self
.metadata
, 'id', scheme
='ISBN', default
=[None])[0]
233 self
.license
= get_metadata(self
.metadata
, 'rights', scheme
='License', default
=[None])[0]
235 self
.toc
= self
.info
['TOC']
238 self
.workdir
= tempfile
.mkdtemp(prefix
=bookname
, dir=TMPDIR
)
239 os
.chmod(self
.workdir
, 0755)
241 self
.body_html_file
= self
.filepath('body.html')
242 self
.body_pdf_file
= self
.filepath('body.pdf')
243 self
.preamble_html_file
= self
.filepath('preamble.html')
244 self
.preamble_pdf_file
= self
.filepath('preamble.pdf')
245 self
.tail_html_file
= self
.filepath('tail.html')
246 self
.tail_pdf_file
= self
.filepath('tail.pdf')
247 self
.isbn_pdf_file
= None
248 self
.pdf_file
= self
.filepath('final.pdf')
249 self
.body_odt_file
= self
.filepath('body.odt')
250 self
.outline_file
= self
.filepath('outline.txt')
252 self
.publish_file
= os
.path
.abspath(os
.path
.join(config
.PUBLISH_DIR
, bookname
))
254 if page_settings
is not None:
255 self
.maker
= PageSettings(**page_settings
)
257 if title
is not None:
260 titles
= get_metadata(self
.metadata
, 'title')
262 self
.title
= titles
[0]
264 self
.title
= 'A Book About ' + self
.book
265 if isinstance(self
.title
, unicode):
266 self
.title
= self
.title
.encode('utf-8')
268 self
.notify_watcher()
271 if config
.TRY_BOOK_CLEANUP_ON_DEL
:
272 #Don't even define __del__ if it is not used.
273 _try_cleanup_on_del
= True
275 if self
._try
_cleanup
_on
_del
and os
.path
.exists(self
.workdir
):
276 self
._try
_cleanup
_on
_del
= False #or else you can get in bad cycles
279 def get_tree_by_id(self
, id):
280 """get an HTML tree from the given manifest ID"""
281 name
= self
.manifest
[id]['url']
282 mimetype
= self
.manifest
[id]['mimetype']
283 s
= self
.store
.read(name
)
285 if mimetype
== 'text/html':
287 tree
= lxml
.html
.parse(f
)
288 except etree
.XMLSyntaxError
, e
:
289 log('Could not parse html ID %r, filename %r, string %r... exception %s' %
290 (id, name
, s
[:20], e
))
291 tree
= lxml
.html
.document_fromstring('<html><body></body></html>').getroottree()
292 elif 'xml' in mimetype
: #XXX or is this just asking for trouble?
293 tree
= etree
.parse(f
)
def filepath(self, fn):
    """Return the path of fn inside this book's working directory.

    The result is os.path.join(self.workdir, fn), so an absolute fn is
    returned as given rather than being placed under the work directory.
    """
    return os.path.join(self.workdir, fn)
302 def save_tempfile(self
, fn
, data
):
303 """Save the data in a temporary directory that will be cleaned
304 up when all is done. Return the absolute file path."""
305 fn
= self
.filepath(fn
)
309 def make_oo_doc(self
):
310 """Make an openoffice document, using the html2odt script."""
312 html_text
= etree
.tostring(self
.tree
, method
="html")
313 save_data(self
.body_html_file
, html_text
)
314 run([config
.HTML2ODT
, self
.workdir
, self
.body_html_file
, self
.body_odt_file
])
315 log("Publishing %r as %r" % (self
.body_odt_file
, self
.publish_file
))
316 os
.rename(self
.body_odt_file
, self
.publish_file
)
317 self
.notify_watcher()
319 def extract_pdf_outline(self
):
320 """Get the outline (table of contents) for the PDF, which
321 wkhtmltopdf should have written to a file. If that file
322 doesn't exist (or config says not to use it), fall back to
323 using self._extract_pdf_outline_the_old_way, below.
325 if config
.USE_DUMP_OUTLINE
:
327 self
.outline_contents
, number_of_pages
= \
328 parse_extracted_outline(self
.outline_file
)
331 traceback
.print_exc()
332 number_of_pages
= self
._extract
_pdf
_outline
_the
_old
_way
()
334 number_of_pages
= self
._extract
_pdf
_outline
_the
_old
_way
()
336 self
.notify_watcher()
337 return number_of_pages
339 def _extract_pdf_outline_the_old_way(self
):
340 """Try to get the PDF outline using pdftk. This doesn't work
341 well with all scripts."""
342 debugf
= self
.filepath('extracted-outline.txt')
343 self
.outline_contents
, number_of_pages
= \
344 parse_outline(self
.body_pdf_file
, 1, debugf
)
346 if not self
.outline_contents
:
347 #probably problems with international text. need a horrible hack
348 log('no outline: trying again with ascii headings')
350 tree
= copy
.deepcopy(self
.tree
)
352 for tag
in ('h1', 'h2', 'h3', 'h4'):
353 for i
, e
in enumerate(tree
.getiterator(tag
)):
354 key
= "%s_%s" % (tag
, i
)
355 titlemap
[key
] = e
.text_content().strip(config
.WHITESPACE_AND_NULL
)
358 e
= lxml
.etree
.SubElement(e
, "strong", Class
="initial")
360 log("key: %r, text: %r, value: %r" %(key
, e
.text
, titlemap
[key
]))
362 ascii_html_file
= self
.filepath('body-ascii-headings.html')
363 ascii_pdf_file
= self
.filepath('body-ascii-headings.pdf')
364 html_text
= lxml
.etree
.tostring(tree
, method
="html")
365 save_data(ascii_html_file
, html_text
)
366 self
.maker
.make_raw_pdf(ascii_html_file
, ascii_pdf_file
, outline
=True)
367 debugf
= self
.filepath('ascii-extracted-outline.txt')
368 ascii_contents
, number_of_ascii_pages
= \
369 parse_outline(ascii_pdf_file
, 1, debugf
)
370 self
.outline_contents
= []
371 log ("number of pages: %s, post ascii: %s" %
372 (number_of_pages
, number_of_ascii_pages
))
373 for ascii_title
, depth
, pageno
in ascii_contents
:
374 if ascii_title
[-4:] == '�': #stupid [something] puts this in
375 ascii_title
= ascii_title
[:-4]
376 if ' ' in ascii_title
:
377 ascii_title
= ascii_title
.rsplit(' ', 1)[1]
378 title
= titlemap
.get(ascii_title
, '')
379 log((ascii_title
, title
, depth
, pageno
))
381 self
.outline_contents
.append((title
, depth
, pageno
))
383 return number_of_pages
385 def make_body_pdf(self
):
386 """Make a pdf of the HTML, using webkit"""
388 html_text
= etree
.tostring(self
.tree
, method
="html")
389 save_data(self
.body_html_file
, html_text
)
392 self
.maker
.make_raw_pdf(self
.body_html_file
, self
.body_pdf_file
, outline
=True, outline_file
=self
.outline_file
)
393 self
.notify_watcher('generate_pdf')
395 n_pages
= self
.extract_pdf_outline()
397 log ("found %s pages in pdf" % n_pages
)
398 #4. resize pages, shift gutters, even pages
399 self
.maker
.reshape_pdf(self
.body_pdf_file
, self
.dir, centre_end
=True)
400 self
.notify_watcher('reshape_pdf')
403 self
.maker
.number_pdf(self
.body_pdf_file
, n_pages
, dir=self
.dir,
404 numbers
=self
.page_numbers
)
405 self
.notify_watcher("number_pdf")
406 self
.notify_watcher()
408 def make_preamble_pdf(self
):
409 contents
= self
.make_contents()
410 inside_cover_html
= self
.compose_inside_cover()
411 log_types(self
.dir, self
.css_url
, self
.title
, inside_cover_html
,
412 self
.toc_header
, contents
, self
.title
)
414 html
= ('<html dir="%s"><head>\n'
415 '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n'
416 '<link rel="stylesheet" href="%s" />\n'
418 '<h1 class="frontpage">%s</h1>'
420 '<div class="contents"><h1>%s</h1>\n%s</div>\n'
421 '<div style="page-break-after: always; color:#fff" class="unseen">.'
422 '<!--%s--></div></body></html>'
423 ) % (self
.dir, self
.css_url
, self
.title
, inside_cover_html
,
424 self
.toc_header
, contents
, self
.title
)
425 save_data(self
.preamble_html_file
, html
)
427 self
.maker
.make_raw_pdf(self
.preamble_html_file
, self
.preamble_pdf_file
)
429 self
.maker
.reshape_pdf(self
.preamble_pdf_file
, self
.dir, centre_start
=True)
431 self
.maker
.number_pdf(self
.preamble_pdf_file
, None, dir=self
.dir,
432 numbers
=self
.preamble_page_numbers
,
435 self
.notify_watcher()
437 def make_end_matter_pdf(self
):
438 """Make an inside back cover and a back cover. If there is an
439 isbn number its barcode will be put on the back cover."""
441 self
.isbn_pdf_file
= self
.filepath('isbn.pdf')
442 self
.maker
.make_barcode_pdf(self
.isbn
, self
.isbn_pdf_file
)
443 self
.notify_watcher('make_barcode_pdf')
445 end_matter
= self
.compose_end_matter()
447 save_data(self
.tail_html_file
, end_matter
.decode('utf-8'))
448 self
.maker
.make_raw_pdf(self
.tail_html_file
, self
.tail_pdf_file
)
450 self
.maker
.reshape_pdf(self
.tail_pdf_file
, self
.dir, centre_start
=True,
451 centre_end
=True, even_pages
=False)
452 self
.notify_watcher()
454 def make_book_pdf(self
):
455 """A convenient wrapper of a few necessary steps"""
456 # now the Xvfb server is needed. make sure it has had long enough to get going
459 self
.make_preamble_pdf()
460 self
.make_end_matter_pdf()
462 concat_pdfs(self
.pdf_file
, self
.preamble_pdf_file
,
463 self
.body_pdf_file
, self
.tail_pdf_file
,
466 self
.notify_watcher('concatenated_pdfs')
468 def make_templated_html(self
, template
=None, zip=False, index
=config
.TEMPLATING_INDEX_FIRST
):
469 """Make a templated html version of the book."""
470 #set up the directory and static files
472 destdir
= self
.filepath('html')
474 os
.rename(self
.filepath('static'), self
.filepath(os
.path
.join(destdir
, 'static')))
477 template_tree
= lxml
.html
.parse(config
.TEMPLATING_DEFAULT_TEMPLATE
).getroot()
479 template_tree
= lxml
.html
.document_fromstring(template
)
481 tocmap
= filename_toc_map(self
.toc
)
482 contents_name
, first_name
= config
.TEMPLATING_INDEX_MODES
[index
]
484 #build a contents page and a contents menu
485 #We can't make this in the same pass because the menu needs to
486 #go in every page (i.e., into the template)
487 menu
= etree
.Element('ul', Class
=config
.TEMPLATING_MENU_ELEMENT
)
488 contents
= etree
.Element('div', Class
=config
.TEMPLATING_REPLACED_ELEMENT
)
490 booktitle
= etree
.Element('div', Class
=config
.TEMPLATING_BOOK_TITLE_ELEMENT
)
492 booktitle
.text
= self
.title
.decode('utf-8')
494 etree
.SubElement(contents
, 'h1').text
= self
.title
.decode('utf-8')
496 savename
= first_name
497 for ID
in self
.spine
:
498 filename
= self
.manifest
[ID
]['url']
499 #handle any TOC points in this file.
500 for point
in tocmap
[filename
]:
501 if point
['type'] == 'booki-section':
502 etree
.SubElement(contents
, 'h2').text
= point
['title']
503 etree
.SubElement(menu
, 'li', Class
='booki-section').text
= point
['title']
507 div
= etree
.SubElement(contents
, 'div')
508 etree
.SubElement(div
, 'a', href
=savename
).text
= point
['title']
509 li
= etree
.SubElement(menu
, 'li')
511 etree
.SubElement(li
, 'a', href
=savename
).text
= point
['title']
513 #put the menu and book title into the template (if it wants it)
514 for e
in template_tree
.iterdescendants(config
.TEMPLATING_MENU_ELEMENT
):
515 e
.getparent().replace(e
, copy
.deepcopy(menu
))
516 for e
in template_tree
.iterdescendants(config
.TEMPLATING_BOOK_TITLE_ELEMENT
):
517 e
.getparent().replace(e
, copy
.deepcopy(booktitle
))
519 #function to template content and write to disk
520 def save_content(content
, title
, filename
):
521 if not isinstance(title
, unicode):
522 title
= title
.decode('utf-8')
523 content
.set('id', config
.TEMPLATING_CONTENTS_ID
)
525 dest
= copy
.deepcopy(template_tree
)
526 dest
.set('dir', self
.dir)
527 for e
in dest
.iterdescendants(config
.TEMPLATING_REPLACED_ELEMENT
):
528 #copy only if there are more than 2
529 if content
.getparent() is not None:
530 content
= copy
.deepcopy(content
)
531 e
.getparent().replace(e
, content
)
533 chaptertitle
= etree
.Element('div', Class
=config
.TEMPLATING_CHAPTER_TITLE_ELEMENT
)
534 chaptertitle
.text
= title
535 for e
in template_tree
.iterdescendants(config
.TEMPLATING_CHAPTER_TITLE_ELEMENT
):
536 e
.getparent().replace(e
, copy
.deepcopy(chaptertitle
))
537 for e
in dest
.iterdescendants('title'):
538 #log(type(title), title)
540 self
.save_tempfile(os
.path
.join(destdir
, filename
), lxml
.html
.tostring(dest
))
543 #write the contents to a file. (either index.html or contents.html)
544 save_content(contents
, self
.title
, contents_name
)
546 savename
= first_name
547 #and now write each chapter to a file
548 for ID
in self
.spine
:
549 filename
= self
.manifest
[ID
]['url']
551 root
= self
.get_tree_by_id(ID
).getroot()
552 body
= root
.find('body')
554 log("hit %s when trying book.get_tree_by_id(%s).getroot().find('body')" % (e
, ID
))
555 body
= etree
.Element('body')
557 #handle any TOC points in this file. There should only be one!
558 for point
in tocmap
[filename
]:
559 if point
['type'] != 'booki-section':
560 title
= point
['title']
567 save_content(body
, title
, savename
)
569 log(destdir
, self
.publish_file
)
570 os
.rename(destdir
, self
.publish_file
)
571 self
.notify_watcher()
574 def make_simple_pdf(self
, mode
):
575 """Make a simple pdf document without contents or separate
576 title page. This is used for multicolumn newspapers and for
577 web-destined pdfs."""
579 #0. Add heading to beginning of html
580 body
= list(self
.tree
.cssselect('body'))[0]
581 e
= body
.makeelement('h1', {'id': 'book-title'})
582 e
.text
= self
.title
.decode('utf-8')
584 intro
= lxml
.html
.fragment_fromstring(self
.compose_inside_cover())
587 #0.5 adjust parameters to suit the particular kind of output
589 self
.maker
.gutter
= 0
592 html_text
= etree
.tostring(self
.tree
, method
="html")
593 save_data(self
.body_html_file
, html_text
)
595 #2. Make a pdf of it (direct to final pdf)
596 self
.maker
.make_raw_pdf(self
.body_html_file
, self
.pdf_file
, outline
=True, outline_file
=self
.outline_file
)
597 self
.notify_watcher('generate_pdf')
598 n_pages
= count_pdf_pages(self
.pdf_file
)
601 #3. resize pages and shift gutters.
602 self
.maker
.reshape_pdf(self
.pdf_file
, self
.dir, centre_end
=True)
603 self
.notify_watcher('reshape_pdf')
606 self
.maker
.number_pdf(self
.pdf_file
, n_pages
,
607 dir=self
.dir, numbers
=self
.page_numbers
)
608 self
.notify_watcher("number_pdf")
609 self
.notify_watcher()
613 """Rotate the pdf 180 degrees so an RTL book can print on LTR
615 rotated
= self
.filepath('final-rotate.pdf')
616 unrotated
= self
.filepath('final-pre-rotate.pdf')
617 #leave the unrotated pdf intact at first, in case of error.
618 rotate_pdf(self
.pdf_file
, rotated
)
619 os
.rename(self
.pdf_file
, unrotated
)
620 os
.rename(rotated
, self
.pdf_file
)
621 self
.notify_watcher()
def publish_pdf(self):
    """Move the finished PDF to its final resting place.

    Moves self.pdf_file to self.publish_file, logging the move first,
    and then calls self.notify_watcher().
    """
    # Function-scope import: nothing else in view needs shutil.
    import shutil
    log("Publishing %r as %r" % (self.pdf_file, self.publish_file))
    # shutil.move renames when it can and falls back to copy-and-delete
    # when source and destination are on different filesystems, where a
    # bare os.rename raises OSError.
    shutil.move(self.pdf_file, self.publish_file)
    self.notify_watcher()
629 def publish_bookizip(self
):
630 """Publish the bookizip. For this, copy rather than move,
631 because the bookizip might be used by further processing. If
632 possible, a hard link is created."""
633 log("Publishing %r as %r" % (self
.bookizip_file
, self
.publish_file
))
635 run(['cp', '-l', self
.bookizip_file
, self
.publish_file
])
637 run(['cp', self
.bookizip_file
, self
.publish_file
])
638 self
.notify_watcher()
640 def concat_html(self
):
641 """Join all the chapters together into one tree. Keep the TOC
642 up-to-date along the way."""
644 #each manifest item looks like:
650 doc
= lxml
.html
.document_fromstring('<html dir="%s"><body dir="%s"></body></html>'
651 % (self
.dir, self
.dir))
652 tocmap
= filename_toc_map(self
.toc
)
653 for ID
in self
.spine
:
654 details
= self
.manifest
[ID
]
655 #log(ID, pformat(details))
658 root
= self
.get_tree_by_id(ID
).getroot()
660 log("hit %s when trying book.get_tree_by_id(%s).getroot()" % (e
, ID
))
662 #handle any TOC points in this file
663 for point
in tocmap
[details
['url']]:
664 #if the url has a #identifier, use it. Otherwise, make
665 #one up, using a hidden element at the beginning of
666 #the inserted document.
667 #XXX this will break if different files use the same ids
668 #XXX should either replace all, or replace selectively.
669 if point
['fragment']:
670 fragment
= point
['fragment']
672 body
= _find_tag(root
, 'body')
673 fragment
= '%s_%s' % (self
.cookie
, point
['index'])
674 #reuse first tag if it is suitable.
676 body
[0].tag
in ('h1', 'h2', 'h3', 'h4', 'p', 'div')):
677 if body
[0].get('id') is None:
678 body
[0].set('id', fragment
)
680 fragment
= body
[0].get('id')
681 #the chapter starts with a heading. that heading should be the chapter name.
682 if body
[0].tag
in ('h1', 'h2', 'h3'):
683 #log('chapter has title "%s", found html title "%s"' %
684 # (point['title'], body[0].text_content()))
685 point
['html_title'] = body
[0].text_content()
687 marker
= body
.makeelement('div', style
="display:none",
689 body
.insert(0, marker
)
690 point
['html_id'] = fragment
695 def unpack_static(self
):
696 """Extract static files from the zip for the html to refer to."""
697 static_files
= [x
['url'] for x
in self
.manifest
.values()
698 if x
['url'].startswith('static')]
700 os
.mkdir(self
.filepath('static'))
702 for name
in static_files
:
703 s
= self
.store
.read(name
)
704 f
= open(self
.filepath(name
), 'w')
707 self
.notify_watcher()
711 #XXX concatenate the HTML to match how TWiki version worked.
712 # This is perhaps foolishly early -- throwing away useful boundaries.
714 self
.tree
= self
.concat_html()
715 self
.save_tempfile('raw.html', etree
.tostring(self
.tree
, method
='html'))
717 self
.headings
= [x
for x
in self
.tree
.cssselect('h1')]
719 self
.headings
[0].set('class', "first-heading")
720 for h1
in self
.headings
:
721 h1
.title
= h1
.text_content().strip()
722 self
.notify_watcher()
724 def make_contents(self
):
725 """Generate HTML containing the table of contents. This can
726 only be done after the main PDF has been made, because the
727 page numbers are contained in the PDF outline."""
728 header
= '<table class="toc">\n'
729 row_tmpl
= ('<tr><td class="chapter">%s</td><td class="title">%s</td>'
730 '<td class="pagenumber">%s</td></tr>\n')
731 empty_section_tmpl
= ('<tr><td class="empty-section" colspan="3">%s</td></tr>\n')
732 section_tmpl
= ('<tr><td class="section" colspan="3">%s</td></tr>\n')
733 footer
= '\n</table>'
739 #log(self.outline_contents)
740 outline_contents
= iter(self
.outline_contents
)
742 for section
in self
.toc
:
743 if not section
.get('children'):
744 contents
.append(empty_section_tmpl
% section
['title'])
746 contents
.append(section_tmpl
% section
['title'])
748 for point
in section
['children']:
752 h1_text
, level
, page_num
= outline_contents
.next()
753 except StopIteration:
754 log("contents data not found for %s. Stopping" % (point
,))
756 contents
.append(row_tmpl
% (chapter
, _get_best_title(point
), page_num
))
759 doc
= header
+ '\n'.join(contents
) + footer
760 if isinstance(doc
, unicode):
761 doc
= doc
.encode('utf-8')
762 self
.notify_watcher()
765 def add_section_titles(self
):
766 """Add any section heading pages that the TOC.txt file
767 specifies. These are sub-book, super-chapter groupings.
769 Also add initial numbers to chapters.
775 #only top level sections get a subsection page,
776 #and only if they have children.
777 if t
.get('children'):
778 section
= self
.tree
.makeelement('div', Class
="objavi-subsection")
779 heading
= etree
.SubElement(section
, 'div', Class
="objavi-subsection-heading")
780 heading
.text
= t
['title']
781 for child
in t
['children']:
782 item
= etree
.SubElement(section
, 'div', Class
="objavi-chapter")
783 if 'html_title' in child
:
784 item
.text
= child
['html_title']
785 heading
= self
.tree
.cssselect('#'+ child
['html_id'])
787 _add_initial_number(heading
[0], chapter
)
789 item
.text
= child
['title']
790 _add_initial_number(item
, chapter
)
791 log(item
.text
, debug
='HTMLGEN')
793 log("#%s is %s" % (t
['html_id'], self
.tree
.cssselect('#'+ t
['html_id'])))
794 location
= self
.tree
.cssselect('#'+ t
['html_id'])[0]
795 location
.addprevious(section
)
798 self
.notify_watcher()
801 def add_css(self
, css
=None, mode
='book'):
802 """If css looks like a url, use it as a stylesheet link.
803 Otherwise it is the CSS itself, which is saved to a temporary file
805 log("css is %r" % css
)
807 if css
is None or not css
.strip():
808 css_default
= config
.SERVER_DEFAULTS
[self
.server
]['css-%s' % mode
]
809 if css_default
is None:
810 #guess from language -- this should come first
811 css_modes
= config
.LANGUAGE_CSS
.get(self
.lang
,
812 config
.LANGUAGE_CSS
['en'])
813 css_default
= css_modes
.get(mode
, css_modes
[None])
815 elif not re
.match(r
'^http://\S+$', css
):
816 url
= path2url(self
.save_tempfile('objavi.css', css
), full
=True)
820 #find the head -- it's probably first child but lets not assume.
821 for child
in htmltree
:
822 if child
.tag
== 'head':
826 head
= htmltree
.makeelement('head')
827 htmltree
.insert(0, head
)
829 link
= etree
.SubElement(head
, 'link', rel
='stylesheet', type='text/css', href
=url
)
831 self
.notify_watcher()
835 def _read_localised_template(self
, template
, fallbacks
=['en']):
836 """Try to get the template in the appropriate language, otherwise in English."""
837 for lang
in [self
.lang
] + fallbacks
:
839 fn
= template
% (lang
)
843 log("couldn't open inside front cover for lang %s (filename %s)" % (lang
, fn
))
849 def compose_inside_cover(self
):
850 """create the markup for the preamble inside cover."""
851 template
= self
._read
_localised
_template
(config
.INSIDE_FRONT_COVER_TEMPLATE
)
854 isbn_text
= '<b>ISBN :</b> %s <br>' % self
.isbn
858 return template
% {'date': time
.strftime('%Y-%m-%d'),
860 'license': self
.license
,
864 def compose_end_matter(self
):
865 """create the markup for the end_matter inside cover. If
866 self.isbn is not set, the html will result in a pdf that
867 spills onto two pages.
869 template
= self
._read
_localised
_template
(config
.END_MATTER_TEMPLATE
)
871 d
= {'css_url': self
.css_url
,
876 d
['inside_cover_style'] = ''
878 d
['inside_cover_style'] = 'page-break-after: always'
883 def make_epub(self
, use_cache
=False):
884 """Make an epub version of the book, using Mike McCabe's
885 epub module for the Internet Archive."""
886 ebook
= ia_epub
.Book(self
.publish_file
, content_dir
='')
887 def add_file(ID
, filename
, mediatype
, content
):
888 ebook
.add_content({'media-type': mediatype
.encode('utf-8'),
889 'id': ID
.encode('utf-8'),
890 'href': filename
.encode('utf-8'),
893 toc
= self
.info
['TOC']
896 filemap
= {} #map html to corresponding xhtml
897 spinemap
= {} #map IDs to multi-file chapters
898 for ID
in self
.manifest
:
899 details
= self
.manifest
[ID
]
900 #log(ID, pformat(details))
901 fn
, mediatype
= details
['url'], details
['mimetype']
902 content
= self
.store
.read(fn
)
903 if mediatype
== 'text/html':
904 #convert to application/xhtml+xml, and perhaps split
905 c
= EpubChapter(self
.server
, self
.book
, ID
, content
,
908 if fn
[-5:] == '.html':
912 fnx
= fnbase
+ '.xhtml'
913 mediatype
= 'application/xhtml+xml'
915 fragments
= split_html(c
.as_xhtml(),
916 compressed_size
=self
.store
.getinfo(fn
).compress_size
)
918 #add the first one as if it is the whole thing (as it often is)
919 add_file(ID
, fnx
, mediatype
, fragments
[0])
921 if len(fragments
) > 1:
923 spinemap
[ID
] = spine_ids
925 for i
in range(1, len(fragments
)):
926 # XXX it is possible for duplicates if another
927 # file happens to have this name. Ignore for now
928 _id
= '%s_SONY_WORKAROUND_%s' % (ID
, i
)
929 spine_ids
.append(_id
)
931 '%s_SONY_WORKAROUND_%s.xhtml' % (fnbase
, i
),
932 mediatype
, fragments
[i
])
935 add_file(ID
, fn
, mediatype
, content
)
938 ncx
= epub_utils
.make_ncx(toc
, self
.metadata
, filemap
)
939 ebook
.add(ebook
.content_dir
+ 'toc.ncx', ncx
)
942 for ID
in self
.spine
:
944 for x
in spinemap
[ID
]:
945 ebook
.add_spine_item({'idref': x
})
947 ebook
.add_spine_item({'idref': ID
})
949 #metadata -- no use of attributes (yet)
950 # and fm: metadata disappears for now
954 for ns
, namespace
in self
.metadata
.items():
955 for keyword
, schemes
in namespace
.items():
957 keyword
= '{%s}%s' % (ns
, keyword
)
958 for scheme
, values
in schemes
.items():
965 if keyword
in (DCNS
+ 'creator', DCNS
+ 'contributor'):
966 item
['atts'] = {'role': scheme
}
968 item
['atts'] = {'scheme': scheme
}
970 has_authors
= 'creator' in self
.metadata
[DC
]
971 if not has_authors
and config
.CLAIM_UNAUTHORED
:
973 for x
in self
.metadata
[DC
]['creator'].values():
976 meta_info_items
.append({'item': DCNS
+ 'creator',
977 'text': 'The Contributors'})
979 meta_info_items
.append({'item': DCNS
+ 'rights',
980 'text': 'This book is free. Copyright %s' % (', '.join(authors
))}
983 tree_str
= ia_epub
.make_opf(meta_info_items
,
984 ebook
.manifest_items
,
988 ebook
.add(ebook
.content_dir
+ 'content.opf', tree_str
)
990 self
.notify_watcher()
993 def publish_s3(self
):
994 """Push the book's epub to archive.org, using S3."""
997 for x
in ('S3_SECRET', 'S3_ACCESSKEY'):
998 fn
= getattr(config
, x
)
1000 secrets
[x
] = f
.read().strip()
1003 now
= time
.strftime('%F')
1004 s3output
= self
.filepath('s3-output.txt')
1005 s3url
, detailsurl
= find_archive_urls(self
.book
, self
.bookname
)
1007 'x-amz-auto-make-bucket:1',
1008 "authorization: LOW %(S3_ACCESSKEY)s:%(S3_SECRET)s" % secrets
,
1009 'x-archive-meta-mediatype:texts',
1010 'x-archive-meta-collection:opensource',
1011 'x-archive-meta-title:%s' % (self
.book
,),
1012 'x-archive-meta-date:%s' % (now
,),
1013 'x-archive-meta-creator:FLOSS Manuals Contributors',
1016 if self
.license
in config
.LICENSES
:
1017 headers
.append('x-archive-meta-licenseurl:%s' % config
.LICENSES
[self
.license
])
1019 argv
= ['curl', '--location', '-s', '-o', s3output
]
1021 argv
.extend(('--header', h
))
1022 argv
.extend(('--upload-file', self
.publish_file
, s3url
,))
1024 log(' '.join(repr(x
) for x
in argv
))
1025 check_call(argv
, stdout
=sys
.stderr
)
1026 self
.notify_watcher()
1027 return detailsurl
, s3url
1029 def publish_shared(self
, group
=None, user
=None):
1030 """Make symlinks from the BOOKI_SHARED_DIRECTORY to the
1031 published file, so that a virtual host can be set up to
1032 publish the files from a static location. If group is set, it
1033 is used as a subdirectory, otherwise a virtual group like
1034 'lonely-user-XXX' is used."""
1038 group
= config
.BOOKI_SHARED_LONELY_USER_PREFIX
+ user
1039 group
= group
.replace('..', '+').replace('/', '+')
1040 group
= re
.sub("[^\w%.,-]+", "_", group
)[:250]
1041 groupdir
= os
.path
.join(config
.BOOKI_SHARED_DIRECTORY
, group
)
1043 generic_name
= re
.sub(r
'-\d{4}\.\d\d\.\d\d\-\d\d\.\d\d\.\d\d', '', self
.bookname
)
1044 log(self
.bookname
, generic_name
)
1046 if not os
.path
.exists(groupdir
):
1049 #change directory, for least symlink confusion
1052 if os
.path
.exists(generic_name
):
1053 os
.unlink(generic_name
)
1054 os
.symlink(os
.path
.abspath(self
.publish_file
), generic_name
)
1059 """Start an Xvfb instance, using a new server number. A
1060 reference to it is stored in self.xvfb, which is used to kill
1061 it when the pdf is done.
1063 Note that Xvfb doesn't interact well with dbus which is
1064 present on modern desktops.
1066 #Find an unused server number (in case two cgis are running at once)
1068 servernum
= random
.randrange(50, 500)
1069 if not os
.path
.exists('/tmp/.X%s-lock' % servernum
):
1072 self
.xserver_no
= ':%s' % servernum
1074 authfile
= self
.filepath('Xauthority')
1075 os
.environ
['XAUTHORITY'] = authfile
1077 #mcookie(1) eats into /dev/random, so avoid that
1078 from hashlib
import md5
1079 m
= md5("%r %r %r %r %r" % (self
, os
.environ
, os
.getpid(), time
.time(), os
.urandom(32)))
1080 mcookie
= m
.hexdigest()
1082 check_call(['xauth', 'add', self
.xserver_no
, '.', mcookie
])
1084 self
.xvfb
= Popen(['Xvfb', self
.xserver_no
,
1085 '-screen', '0', '1024x768x24',
1087 #'-blackpixel', '0',
1088 #'-whitepixel', str(2 ** 24 -1),
1089 #'+extension', 'Composite',
1095 # We need to wait a bit before the Xvfb is ready. but the
1096 # downloads are so slow that that probably doesn't matter
1098 self
.xvfb_ready_time
= time
.time() + 2
1100 os
.environ
['DISPLAY'] = self
.xserver_no
1101 log(self
.xserver_no
)
def wait_for_xvfb(self):
    """Sleep until the time stored in self.xvfb_ready_time, so that
    Xvfb has had time to start properly.  Does nothing when this
    instance has no xvfb attribute."""
    if not hasattr(self, 'xvfb'):
        return
    remaining = self.xvfb_ready_time - time.time()
    # NOTE(review): the two lines between computing the delay and the
    # notify_watcher() call are missing from the extracted source; a
    # guarded sleep is assumed.  Confirm the nesting of the notify call.
    if remaining > 0:
        time.sleep(remaining)
        self.notify_watcher()
def cleanup_x(self):
    """Try very hard to kill off Xvfb.  In addition to killing
    this instance's xvfb, occasionally (randomly) search for
    escaped Xvfb instances and kill those too.

    Does nothing if no Xvfb was started by this instance.  A failure
    of 'xauth remove' still propagates (as raised by check_call), but
    only after the attempt to kill Xvfb has been made.
    """
    if not hasattr(self, 'xvfb'):
        return
    p = self.xvfb
    try:
        check_call(['xauth', 'remove', self.xserver_no])
    finally:
        # The kill must not depend on xauth having succeeded.
        log("trying to kill Xvfb %s" % p.pid)
        # NOTE(review): the signal number, retry count and pause are
        # not visible in the extracted source; confirm these values.
        os.kill(p.pid, 15)
        for i in range(10):
            if p.poll() is not None:
                log("%s died with %s" % (p.pid, p.poll()))
                break
            log("%s not dead yet" % p.pid)
            time.sleep(0.2)
        else:
            # the loop ran out without seeing the process exit
            log("Xvfb would not die! kill -9! kill -9!")
            try:
                os.kill(p.pid, 9)
            except OSError as e:
                log(e)

    if random.random() < 0.1:
        # occasionally kill old xvfbs and soffices, if there are any.
        self.kill_old_processes()
def kill_old_processes(self):
    """Sometimes, despite everything, Xvfb or soffice instances
    hang around well after they are wanted -- for example if the
    cgi process dies particularly badly.  So kill them if they have
    been running for a long time.

    Candidates are found with 'ps -C <names> -o pid,etime'.  A
    process is killed when its elapsed time shows a days field, an
    hours field, or more than 50 minutes.  ps output lines that cannot
    be parsed are logged and skipped.
    """
    log("running kill_old_processes")
    killable_names = ' '.join(['Xvfb', 'soffice', 'soffice.bin', 'ooffice',
                               os.path.basename(config.HTML2ODT),
                               os.path.basename(config.WKHTMLTOPDF),
                               ])
    p = Popen(['ps', '-C', killable_names,
               '-o', 'pid,etime', '--no-headers'], stdout=PIPE)
    data = p.communicate()[0].strip()
    if data:
        lines = data.split('\n')
        pids = []
        for line in lines:
            log('dealing with ps output "%s"' % line)
            try:
                pid, days, hours, minutes, seconds \
                    = re.match(r'^\s*(\d+)\s+(\d+-)?(\d{2})?:?(\d{2}):(\d+)\s*$', line).groups()
            except AttributeError:
                log("Couldn't parse that line!")
                # Without this, the test below would run on unbound
                # names or on the values left by the previous line.
                continue
            # 50 minutes should be enough xvfb time for anyone
            if days or hours or int(minutes) > 50:
                pid = int(pid)
                log("going to kill pid %s" % pid)
                # NOTE(review): the first kill attempt and the pause
                # before the retry are missing from the extracted
                # source; signal 15 and one second are assumed.
                try:
                    os.kill(pid, 15)
                except OSError as e:
                    # e.g. the process exited after ps listed it
                    log('could not signal PID %s (%s)' % (pid, e))
                    continue
                pids.append(pid)

        if pids:
            time.sleep(1.0)
        for pid in pids:
            #try again in case any are lingerers
            try:
                os.kill(int(pid), 9)
            except OSError as e:
                log('PID %s seems dead (re-kill gives %s)' % (pid, e))
                continue
            log('killing %s with -9' % pid)
    self.notify_watcher()
1182 if not config
.KEEP_TEMP_FILES
:
1183 for fn
in os
.listdir(self
.workdir
):
1184 os
.remove(os
.path
.join(self
.workdir
, fn
))
1185 os
.rmdir(self
.workdir
)
1187 log("NOT removing '%s', containing the following files:" % self
.workdir
)
1188 log(*os
.listdir(self
.workdir
))
1190 self
.notify_watcher()
1194 return (os
.environ
.get('HTTP_HOST') in config
.USE_ZIP_CACHE_ALWAYS_HOSTS
)
def _read_cached_zip(server, book, max_age):
    """Return (blob, zipname) for a recent cached booki-zip, or None.

    server, book -- passed to make_book_name() to find the cache
                    file name prefix under config.BOOKI_BOOK_DIR.
    max_age      -- maximum acceptable age, in minutes, judged from
                    the date embedded in the file name.

    Only the last matching file name in sorted order is considered.
    A missing, unreadable, oddly named or too-old zip is logged and
    reported as None.
    """
    #find a recent zip if possible
    prefix = '%s/%s' % (config.BOOKI_BOOK_DIR, make_book_name(book, server, '').split('-20', 1)[0])
    from glob import glob
    zips = sorted(glob(prefix + '*.zip'))
    if not zips:
        log("no cached booki-zips matching %s*.zip" % (prefix,))
        return None
    # NOTE(review): the selection line is missing from the extracted
    # source; the last name in sorted order is assumed to be the newest.
    zipname = zips[-1]
    cutoff = time.time() - max_age * 60
    try:
        # Parse only what follows the prefix: the prefix is a path, not
        # a format, and a '%' in it would derail strptime.
        stamp = zipname[len(prefix):]
        date = time.mktime(time.strptime(stamp, '-%Y.%m.%d-%H.%M.%S.zip'))
        if date > cutoff:
            # binary mode: the blob is zip data, not text
            with open(zipname, 'rb') as f:
                blob = f.read()
            return blob, zipname
        log("%s is too old, must reload" % zipname)
        return None
    except (IOError, IndexError, ValueError) as e:
        log('could not make sense of %s: got exception %s' % (zipname, e))
        return None
def fetch_zip(server, book, save=False, max_age=-1, filename=None):
    """Get the booki-zip for a book, from the cache or from its server.

    server, book -- identify the book; they are interpolated into the
                    URL template that config.ZIP_URLS holds for the
                    server's interface.
    save         -- if true, write the downloaded zip to *filename*.
    max_age      -- acceptable age of a cached zip, in minutes.  A
                    negative value means "default", which becomes 12
                    hours when use_cache() is true.
    filename     -- where to save; defaults to a name made by
                    make_book_name() under config.BOOKI_BOOK_DIR.

    Returns a (blob, filename) pair.  Raises NotImplementedError if
    there is no URL template for the server's interface.
    """
    interface = config.SERVER_DEFAULTS[server].get('interface', 'Booki')
    try:
        url = config.ZIP_URLS[interface] % {'HTTP_HOST': HTTP_HOST,
                                            'server': server, 'book':book}
    except KeyError:
        raise NotImplementedError("Can't handle '%s' interface" % interface)

    if use_cache() and max_age < 0:
        #default to 12 hours cache on objavi.halo.gen.nz
        max_age = 12 * 60

    # NOTE(review): this condition is missing from the extracted
    # source; confirm whether a negative max_age should also try the
    # cache (it could never match anything there).
    if max_age > 0:
        log('WARNING: trying to use cached booki-zip',
            'If you are debugging booki-zip creation, you will go CRAZY'
            ' unless you switch this off')
        blob_and_name = _read_cached_zip(server, book, max_age)
        if blob_and_name is not None:
            return blob_and_name

    log('fetching zip from %s'% url)
    f = urlopen(url)
    try:
        blob = f.read()
    finally:
        f.close()
    if save:
        if filename is None:
            filename = '%s/%s' % (config.BOOKI_BOOK_DIR,
                                  make_book_name(book, server, '.zip'))
        # binary mode: the blob is zip data, not text
        f = open(filename, 'wb')
        try:
            f.write(blob)
        finally:
            f.close()
    return blob, filename
def split_html(html, compressed_size=None, fix_markup=False):
    """Split long html into pieces that respect the epub size limits.

    html            -- the serialised html document.
    compressed_size -- its zlib-compressed size, if already known;
                       otherwise it is measured here.
    fix_markup      -- if true, re-serialise the html through lxml
                       first, so that '<' only occurs where tags start.

    The number of cuts is the larger of compressed size over
    config.EPUB_COMPRESSED_SIZE_MAX and raw size over
    config.EPUB_FILE_SIZE_MAX.  Marker <hr> elements are inserted at
    roughly even intervals, each just before a '<', and split_tree()
    cuts the parsed tree at those markers.  Returns a list of
    serialised html strings.
    """
    if compressed_size is None:
        import zlib
        compressed_size = len(zlib.compress(html))

    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" % (len(html), compressed_size, splits))

    # NOTE(review): the lines between the log call and the markup
    # fix-up are missing from the extracted source; an early return of
    # the untouched html is assumed when no split is needed.
    if not splits:
        return [html]

    if fix_markup:
        #remove '<' in attributes etc, which makes the marker
        #insertion more reliable
        # NOTE(review): the keyword arguments of this call are not
        # visible in the extracted source; confirm the encoding.
        html = etree.tostring(lxml.html.fromstring(html),
                              encoding='UTF-8',
                              )

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        if e == -1:
            # No tag starts beyond this point.  Slicing with -1 would
            # drop the final character and send the cursor backwards,
            # so stop placing markers instead.
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (config.MARKER_CLASS_SPLIT, i))
        s = e
    fragments.append(html[s:])

    #XXX somehow try to avoid split in silly places (e.g, before inline elements)
    chapters = split_tree(lxml.html.fromstring(''.join(fragments)))
    return [etree.tostring(c.tree, encoding='UTF-8', method='html') for c in chapters]