espri.cgi

   1 #!/usr/bin/python
   2 #
   3 # Part of the Objavi2 package.  This script imports e-books into Booki
   4 #
   5 # Copyright (C) 2009 Douglas Bagnall
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 import os, sys
  22 import time
  23 from urllib2 import urlopen, URLError
  24 from urllib import urlencode, unquote
  25 from urlparse import urlsplit
  26 import traceback, tempfile
  27 from subprocess import check_call, CalledProcessError
  28
  29 from objavi import epub
  30 from objavi.book_utils import log
  31 from objavi.cgi_utils import output_blob_and_exit, parse_args, print_template_and_exit, output_blob_and_shut_up
  32 from objavi.cgi_utils import is_utf8, is_url, super_bleach
  33 from objavi import config
  34
# URL template for an Internet Archive epub download.  ia_espri() fills
# both %s slots with the same book identifier.
IA_EPUB_URL = "http://www.archive.org/download/%s/%s.epub"
  36
  37 def print_form_and_exit(booklink):
  38     print_template_and_exit('templates/espri.html',
  39                             {'booklink': booklink, }
  40                             )
  41
  42 def async_start(content, mimetype):
  43     """Begin (and in many cases, finish) http output.
  44     In asynchronous modes, fork and close down stdout.
  45     """
  46     output_blob_and_shut_up(content, mimetype)
  47     log(sys.stdout, sys.stderr, sys.stdin)
  48     if os.fork():
  49         os._exit(0)
  50     sys.stdout.close()
  51     sys.stdin.close()
  52     #log(sys.stdout, sys.stderr, sys.stdin)
  53
  54
  55 def async_callback(callback_url, **kwargs):
  56     """Call the callback url with each message."""
  57     pid = os.fork()
  58     if pid:
  59         log('child %s is doing callback with message %r' % (pid, kwargs, ))
  60         return
  61     data = urlencode(kwargs)
  62     try:
  63         f = urlopen(callback_url, data)
  64         time.sleep(2)
  65         f.close()
  66     except URLError, e:
  67         traceback.print_exc()
  68         log("ERROR in callback:\n %r\n %s %s" % (e.url, e.code, e.msg))
  69     os._exit(0)
  70
  71
  72 def espri(epuburl, zipurl):
  73     log(epuburl, zipurl)
  74     f = urlopen(epuburl)
  75     s = f.read()
  76     f.close()
  77     e = epub.Epub()
  78     e.load(s)
  79     e.parse_meta()
  80     e.parse_opf()
  81     e.parse_ncx()
  82     e.make_bookizip(zipurl)
  83
  84 def ia_espri(book_id):
  85     epuburl = IA_EPUB_URL % (book_id, book_id)
  86     log(epuburl)
  87     zipurl = '%s/%s.zip' % (config.BOOKI_BOOK_DIR, book_id)
  88     espri(epuburl, zipurl)
  89     return zipurl
  90
  91 def inet_espri(epuburl):
  92     tainted_name = unquote(os.path.basename(urlsplit(epuburl).path))
  93     filename = super_bleach(tainted_name)
  94     if filename.lower().endswith('-epub'):
  95         filename = filename[:-5]
  96     zipurl = '%s/%s-%s.zip' % (config.BOOKI_BOOK_DIR, filename, time.strftime('%F_%T'))
  97     espri(epuburl, zipurl)
  98     return zipurl
  99
 100
# Settings for the external wikibooks importer run by wikibooks_espri().
# Wrapper command placed at the front of the importer's command line.
TIMEOUT_CMD = 'timeout'
# First argument to TIMEOUT_CMD; kept as a string because it is passed
# on the command line (reported as seconds in the TimeoutError message).
WIKIBOOKS_TIMEOUT = '600'
# The importer executable itself.
WIKIBOOKS_CMD = 'wikibooks2epub'
# Value exported as the oxCACHE environment variable before the run.
WIKIBOOKS_CACHE = 'cache/wikibooks'
 105
 106 class TimeoutError(Exception):
 107     pass
 108
 109 def wikibooks_espri(wiki_url):
 110     """Wikibooks import using the wikibooks2epub script by Jan Gerber
 111     to first convert the wikibook to an epub, which can then be turned
 112     into a bookizip via the espri function.
 113     """
 114     os.environ['oxCACHE'] = WIKIBOOKS_CACHE
 115     tainted_name = unquote(os.path.basename(urlsplit(wiki_url).path))
 116     filename = "%s-%s" % (super_bleach(tainted_name),
 117                           time.strftime('%Y.%m.%d-%H.%M.%S'))
 118     workdir = tempfile.mkdtemp(prefix=filename, dir=config.TMPDIR)
 119     os.chmod(workdir, 0755)
 120     epub_file = os.path.join(workdir, filename + '.epub')
 121     epub_url = 'file://' + os.path.abspath(epub_file)
 122     #epub_url = 'http://localhost/' + epub_file
 123
 124     #the wikibooks importer is a separate process, so run that, then collect the epub.
 125     cmd = [TIMEOUT_CMD, WIKIBOOKS_TIMEOUT,
 126            WIKIBOOKS_CMD,
 127            '-i', wiki_url,
 128            '-o', epub_file
 129            ]
 130     log(cmd)
 131
 132     try:
 133         check_call(cmd)
 134     except CalledProcessError, e:
 135         if e.returncode == 137:
 136             raise TimeoutError('Wikibooks took too long (over %s seconds)' % WIKIBOOKS_TIMEOUT)
 137         raise
 138
 139     zipurl = '%s/%s.zip' % (config.BOOKI_BOOK_DIR, filename)
 140     espri(epub_url, zipurl)
 141     return zipurl
 142
 143
 144
 145
 146 SOURCES = {
 147     'archive.org': {'function': ia_espri},
 148     'url': {'function': inet_espri},
 149     'wikibooks': {'function': wikibooks_espri},
 150 }
 151 ARG_VALIDATORS = {
 152     "source": SOURCES.__contains__,
 153     "book": is_utf8,
 154     "url": is_url,  #obsolete
 155     'mode': ('zip', 'html', 'callback').__contains__,
 156     'callback': is_url,
 157 }
 158
 159 def ensure_backwards_compatibility(args):
 160     """Mutate args to match previous API"""
 161     if 'url' in args:
 162         args['source'] = 'url'
 163         args['book'] = args['url']
 164     if 'source' not in args:
 165         args['source'] = 'archive.org'
 166     if 'callback' in args and 'mode' not in args:
 167         args['mode'] = 'callback'
 168
 169
 170 if __name__ == '__main__':
 171     args = parse_args(ARG_VALIDATORS)
 172     ensure_backwards_compatibility(args)
 173     mode = args.get('mode', 'html')
 174     book = args.get('book')
 175     source = args.get('source', 'archive.org')
 176     source_fn = SOURCES.get(source)['function']
 177
 178     if mode == 'callback':
 179         callback_url = args['callback']
 180         async_start('OK, got it...  will call %r when done' % (callback_url,),
 181                     'text/plain')
 182     url = None
 183     if book is not None:
 184         try:
 185             url = source_fn(book)
 186             book_link = '<p>Download <a href="%s">%s</a>.</p>' % (url, url)
 187         except Exception, e:
 188             traceback.print_exc()
 189             log(e, args)
 190             book_link = '<p>Error: <b>%s</b> when trying to get <b>%s</b></p>' % (e, book)
 191             if mode != 'html':
 192                 raise
 193     else:
 194         book_link = ''
 195
 196     if mode == 'callback':
 197         async_callback(callback_url, url=url)
 198
 199     elif mode == 'zip' and url is not None:
 200         f = open(url)
 201         data = f.read()
 202         f.close()
 203         output_blob_and_exit(data, config.BOOKIZIP_MIMETYPE,
 204                              os.path.basename(url))
 205     else:
 206         log(book_link)
 207         print_form_and_exit(book_link)
 208     log('done!')