src/gpodder/download.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # gPodder - A media aggregator and podcast client
   4 # Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
   5 #
   6 # gPodder is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # gPodder is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19
  20
  21 #
  22 #  download.py -- Download client using DownloadStatusManager
  23 #  Thomas Perl <thp@perli.net>   2007-09-15
  24 #
  25 #  Based on libwget.py (2005-10-29)
  26 #
  27
  28 from __future__ import with_statement
  29
  30 from gpodder.liblogger import log
  31 from gpodder.libgpodder import gl
  32 from gpodder.dbsqlite import db
  33 from gpodder import util
  34 from gpodder import resolver
  35 import gpodder
  36
  37 import threading
  38 import urllib
  39 import shutil
  40 import os.path
  41 import os
  42 import time
  43 import collections
  44
  45 from xml.sax import saxutils
  46
  47
  48 class ContentRange(object):
  49     # Based on:
  50     # http://svn.pythonpaste.org/Paste/WebOb/trunk/webob/byterange.py
  51     #
  52     # Copyright (c) 2007 Ian Bicking and Contributors
  53     #
  54     # Permission is hereby granted, free of charge, to any person obtaining
  55     # a copy of this software and associated documentation files (the
  56     # "Software"), to deal in the Software without restriction, including
  57     # without limitation the rights to use, copy, modify, merge, publish,
  58     # distribute, sublicense, and/or sell copies of the Software, and to
  59     # permit persons to whom the Software is furnished to do so, subject to
  60     # the following conditions:
  61     #
  62     # The above copyright notice and this permission notice shall be
  63     # included in all copies or substantial portions of the Software.
  64     #
  65     # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  66     # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  67     # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  68     # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  69     # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  70     # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  71     # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  72     """
  73     Represents the Content-Range header
  74
  75     This header is ``start-stop/length``, where stop and length can be
  76     ``*`` (represented as None in the attributes).
  77     """
  78
  79     def __init__(self, start, stop, length):
  80         assert start >= 0, "Bad start: %r" % start
  81         assert stop is None or (stop >= 0 and stop >= start), (
  82             "Bad stop: %r" % stop)
  83         self.start = start
  84         self.stop = stop
  85         self.length = length
  86
  87     def __repr__(self):
  88         return '<%s %s>' % (
  89             self.__class__.__name__,
  90             self)
  91
  92     def __str__(self):
  93         if self.stop is None:
  94             stop = '*'
  95         else:
  96             stop = self.stop + 1
  97         if self.length is None:
  98             length = '*'
  99         else:
 100             length = self.length
 101         return 'bytes %s-%s/%s' % (self.start, stop, length)
 102
 103     def __iter__(self):
 104         """
 105         Mostly so you can unpack this, like:
 106
 107             start, stop, length = res.content_range
 108         """
 109         return iter([self.start, self.stop, self.length])
 110
 111     @classmethod
 112     def parse(cls, value):
 113         """
 114         Parse the header.  May return None if it cannot parse.
 115         """
 116         if value is None:
 117             return None
 118         value = value.strip()
 119         if not value.startswith('bytes '):
 120             # Unparseable
 121             return None
 122         value = value[len('bytes '):].strip()
 123         if '/' not in value:
 124             # Invalid, no length given
 125             return None
 126         range, length = value.split('/', 1)
 127         if '-' not in range:
 128             # Invalid, no range
 129             return None
 130         start, end = range.split('-', 1)
 131         try:
 132             start = int(start)
 133             if end == '*':
 134                 end = None
 135             else:
 136                 end = int(end)
 137             if length == '*':
 138                 length = None
 139             else:
 140                 length = int(length)
 141         except ValueError:
 142             # Parse problem
 143             return None
 144         if end is None:
 145             return cls(start, None, length)
 146         else:
 147             return cls(start, end-1, length)
 148
 149
 150 class DownloadCancelledException(Exception): pass
 151
 152 class gPodderDownloadHTTPError(Exception):
 153     def __init__(self, url, error_code, error_message):
 154         self.url = url
 155         self.error_code = error_code
 156         self.error_message = error_message
 157
 158 class DownloadURLOpener(urllib.FancyURLopener):
 159     version = gpodder.user_agent
 160
 161     def __init__( self, channel):
 162         if gl.config.proxy_use_environment:
 163             proxies = None
 164         else:
 165             proxies = {}
 166             if gl.config.http_proxy:
 167                 proxies['http'] = gl.config.http_proxy
 168             if gl.config.ftp_proxy:
 169                 proxies['ftp'] = gl.config.ftp_proxy
 170
 171         self.channel = channel
 172         urllib.FancyURLopener.__init__( self, proxies)
 173
 174     def http_error_default(self, url, fp, errcode, errmsg, headers):
 175         """
 176         FancyURLopener by default does not raise an exception when
 177         there is some unknown HTTP error code. We want to override
 178         this and provide a function to log the error and raise an
 179         exception, so we don't download the HTTP error page here.
 180         """
 181         # The following two lines are copied from urllib.URLopener's
 182         # implementation of http_error_default
 183         void = fp.read()
 184         fp.close()
 185         raise gPodderDownloadHTTPError(url, errcode, errmsg)
 186
 187 # The following is based on Python's urllib.py "URLopener.retrieve"
 188 # Also based on http://mail.python.org/pipermail/python-list/2001-October/110069.html
 189
 190     def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
 191         # The next line is taken from urllib's URLopener.open_http
 192         # method, at the end after the line "if errcode == 200:"
 193         return urllib.addinfourl(fp, headers, 'http:' + url)
 194
 195     def retrieve_resume(self, url, filename, reporthook=None, data=None):
 196         """retrieve_resume(url) returns (filename, headers) for a local object
 197         or (tempfilename, headers) for a remote object.
 198
 199         The filename argument is REQUIRED (no tempfile creation code here!)
 200
 201         Additionally resumes a download if the local filename exists"""
 202
 203         current_size = 0
 204         tfp = None
 205         if os.path.exists(filename):
 206             try:
 207                 current_size = os.path.getsize(filename)
 208                 tfp = open(filename, 'ab')
 209                 #If the file exists, then only download the remainder
 210                 self.addheader('Range', 'bytes=%s-' % (current_size))
 211             except:
 212                 log('Cannot open file for resuming: %s', filename, sender=self, traceback=True)
 213                 tfp = None
 214                 current_size = 0
 215
 216         if tfp is None:
 217             tfp = open(filename, 'wb')
 218
 219         url = urllib.unwrap(urllib.toBytes(url))
 220         fp = self.open(url, data)
 221         headers = fp.info()
 222
 223         if current_size > 0:
 224             # We told the server to resume - see if she agrees
 225             # See RFC2616 (206 Partial Content + Section 14.16)
 226             # XXX check status code here, too...
 227             range = ContentRange.parse(headers.get('content-range', ''))
 228             if range is None or range.start != current_size:
 229                 # Ok, that did not work. Reset the download
 230                 # TODO: seek and truncate if content-range differs from request
 231                 tfp.close()
 232                 tfp = open(filename, 'wb')
 233                 current_size = 0
 234                 log('Cannot resume. Missing or wrong Content-Range header (RFC2616)', sender=self)
 235
 236
 237         # gPodder TODO: we can get the real url via fp.geturl() here
 238         #               (if anybody wants to fix filenames in the future)
 239
 240         result = filename, headers
 241         bs = 1024*8
 242         size = -1
 243         read = current_size
 244         blocknum = int(current_size/bs)
 245         if reporthook:
 246             if "content-length" in headers:
 247                 size = int(headers["Content-Length"]) + current_size
 248             reporthook(blocknum, bs, size)
 249         while 1:
 250             block = fp.read(bs)
 251             if block == "":
 252                 break
 253             read += len(block)
 254             tfp.write(block)
 255             blocknum += 1
 256             if reporthook:
 257                 reporthook(blocknum, bs, size)
 258         fp.close()
 259         tfp.close()
 260         del fp
 261         del tfp
 262
 263         # raise exception if actual size does not match content-length header
 264         if size >= 0 and read < size:
 265             raise urllib.ContentTooShortError("retrieval incomplete: got only %i out "
 266                                        "of %i bytes" % (read, size), result)
 267
 268         return result
 269
 270 # end code based on urllib.py
 271
 272     def prompt_user_passwd( self, host, realm):
 273         if self.channel.username or self.channel.password:
 274             log( 'Authenticating as "%s" to "%s" for realm "%s".', self.channel.username, host, realm, sender = self)
 275             return ( self.channel.username, self.channel.password )
 276
 277         return ( None, None )
 278
 279
 280 class DownloadQueueWorker(threading.Thread):
 281     def __init__(self, queue, exit_callback):
 282         threading.Thread.__init__(self)
 283         self.queue = queue
 284         self.exit_callback = exit_callback
 285         self.cancelled = False
 286
 287     def stop_accepting_tasks(self):
 288         """
 289         When this is called, the worker will not accept new tasks,
 290         but quit when the current task has been finished.
 291         """
 292         if not self.cancelled:
 293             self.cancelled = True
 294             log('%s stopped accepting tasks.', self.getName(), sender=self)
 295
 296     def run(self):
 297         log('Running new thread: %s', self.getName(), sender=self)
 298         while not self.cancelled:
 299             try:
 300                 task = self.queue.pop()
 301                 log('%s is processing: %s', self.getName(), task, sender=self)
 302                 task.run()
 303             except IndexError, e:
 304                 log('No more tasks for %s to carry out.', self.getName(), sender=self)
 305                 break
 306         self.exit_callback(self)
 307
 308
 309 class DownloadQueueManager(object):
 310     def __init__(self, download_status_manager):
 311         self.download_status_manager = download_status_manager
 312         self.tasks = collections.deque()
 313
 314         self.worker_threads_access = threading.RLock()
 315         self.worker_threads = []
 316
 317     def __exit_callback(self, worker_thread):
 318         with self.worker_threads_access:
 319             self.worker_threads.remove(worker_thread)
 320
 321     def spawn_and_retire_threads(self, request_new_thread=False):
 322         with self.worker_threads_access:
 323             if len(self.worker_threads) > gl.config.max_downloads and \
 324                     gl.config.max_downloads_enabled:
 325                 # Tell the excessive amount of oldest worker threads to quit, but keep at least one
 326                 count = min(len(self.worker_threads)-1, len(self.worker_threads)-gl.config.max_downloads)
 327                 for worker in self.worker_threads[:count]:
 328                     worker.stop_accepting_tasks()
 329
 330             if request_new_thread and (len(self.worker_threads) == 0 or \
 331                     len(self.worker_threads) < gl.config.max_downloads or \
 332                     not gl.config.max_downloads_enabled):
 333                 # We have to create a new thread here, there's work to do
 334                 log('I am going to spawn a new worker thread.', sender=self)
 335                 worker = DownloadQueueWorker(self.tasks, self.__exit_callback)
 336                 self.worker_threads.append(worker)
 337                 worker.start()
 338
 339     def add_resumed_task(self, task):
 340         """Simply add the task without starting the download"""
 341         self.download_status_manager.register_task(task)
 342
 343     def add_task(self, task):
 344         if task.status == DownloadTask.INIT:
 345             # This task is fresh, so add it to our status manager
 346             self.download_status_manager.register_task(task)
 347         else:
 348             # This task is old so update episode from db
 349             task.episode.reload_from_db()
 350         task.status = DownloadTask.QUEUED
 351         self.tasks.appendleft(task)
 352         self.spawn_and_retire_threads(request_new_thread=True)
 353
 354
 355 class DownloadTask(object):
 356     """An object representing the download task of an episode
 357
 358     You can create a new download task like this:
 359
 360         task = DownloadTask(episode)
 361         task.status = DownloadTask.QUEUED
 362         task.run()
 363
 364     While the download is in progress, you can access its properties:
 365
 366         task.total_size       # in bytes
 367         task.progress         # from 0.0 to 1.0
 368         task.speed            # in bytes per second
 369         str(task)             # name of the episode
 370         task.status           # current status
 371         task.status_changed   # True if the status has been changed
 372
 373     You can cancel a running download task by setting its status:
 374
 375         task.status = DownloadTask.CANCELLED
 376
 377     The task will then abort as soon as possible (due to the nature
 378     of downloading data, this can take a while when the Internet is
 379     busy).
 380
 381     The "status_changed" attribute gets set to True everytime the
 382     "status" attribute changes its value. After you get the value of
 383     the "status_changed" attribute, it is always reset to False:
 384
 385         if task.status_changed:
 386             new_status = task.status
 387             # .. update the UI accordingly ..
 388
 389     Obviously, this also means that you must have at most *one*
 390     place in your UI code where you check for status changes and
 391     broadcast the status updates from there.
 392
 393     While the download is taking place and after the .run() method
 394     has finished, you can get the final status to check if the download
 395     was successful:
 396
 397         if task.status == DownloadTask.DONE:
 398             # .. everything ok ..
 399         elif task.status == DownloadTask.FAILED:
 400             # .. an error happened, and the
 401             #    error_message attribute is set ..
 402             print task.error_message
 403         elif task.status == DownloadTask.PAUSED:
 404             # .. user paused the download ..
 405         elif task.status == DownloadTask.CANCELLED:
 406             # .. user cancelled the download ..
 407
 408     The difference between cancelling and pausing a DownloadTask is
 409     that the temporary file gets deleted when cancelling, but does
 410     not get deleted when pausing.
 411
 412     Be sure to call .removed_from_list() on this task when removing
 413     it from the UI, so that it can carry out any pending clean-up
 414     actions (e.g. removing the temporary file when the task has not
 415     finished successfully; i.e. task.status != DownloadTask.DONE).
 416     """
 417     # Possible states this download task can be in
 418     STATUS_MESSAGE = (_('Added'), _('Queued'), _('Downloading'),
 419             _('Finished'), _('Failed'), _('Cancelled'), _('Paused'))
 420     (INIT, QUEUED, DOWNLOADING, DONE, FAILED, CANCELLED, PAUSED) = range(7)
 421
 422     def __str__(self):
 423         return self.__episode.title
 424
 425     def __get_status(self):
 426         return self.__status
 427
 428     def __set_status(self, status):
 429         if status != self.__status:
 430             self.__status_changed = True
 431             self.__status = status
 432
 433     status = property(fget=__get_status, fset=__set_status)
 434
 435     def __get_status_changed(self):
 436         if self.__status_changed:
 437             self.__status_changed = False
 438             return True
 439         else:
 440             return False
 441
 442     status_changed = property(fget=__get_status_changed)
 443
 444     def __get_url(self):
 445         return self.__episode.url
 446
 447     url = property(fget=__get_url)
 448
 449     def __get_episode(self):
 450         return self.__episode
 451
 452     episode = property(fget=__get_episode)
 453
 454     def removed_from_list(self):
 455         if self.status != self.DONE:
 456             util.delete_file(self.tempname)
 457
 458     def __init__(self, episode):
 459         self.__status = DownloadTask.INIT
 460         self.__status_changed = True
 461         self.__episode = episode
 462
 463         # Create the target filename and save it in the database
 464         self.filename = self.__episode.local_filename(create=True)
 465         self.tempname = self.filename + '.partial'
 466         db.commit()
 467
 468         self.total_size = self.__episode.length
 469         self.speed = 0.0
 470         self.progress = 0.0
 471         self.error_message = None
 472
 473         # Variables for speed limit and speed calculation
 474         self.__start_time = 0
 475         self.__start_blocks = 0
 476         self.__limit_rate_value = gl.config.limit_rate_value
 477         self.__limit_rate = gl.config.limit_rate
 478
 479         # If the tempname already exists, set progress accordingly
 480         if os.path.exists(self.tempname):
 481             try:
 482                 already_downloaded = os.path.getsize(self.tempname)
 483                 if self.total_size > 0:
 484                     self.progress = max(0.0, min(1.0, float(already_downloaded)/self.total_size))
 485             except OSError, os_error:
 486                 log('Error while getting size for existing file: %s', os_error, sender=self)
 487         else:
 488             # "touch self.tempname", so we also get partial
 489             # files for resuming when the file is queued
 490             open(self.tempname, 'w').close()
 491
 492     def status_updated(self, count, blockSize, totalSize):
 493         # We see a different "total size" while downloading,
 494         # so correct the total size variable in the thread
 495         if totalSize != self.total_size and totalSize > 0:
 496             self.total_size = float(totalSize)
 497
 498         if self.total_size > 0:
 499             self.progress = max(0.0, min(1.0, float(count*blockSize)/self.total_size))
 500
 501         self.calculate_speed(count, blockSize)
 502
 503         if self.status == DownloadTask.CANCELLED:
 504             raise DownloadCancelledException()
 505
 506         if self.status == DownloadTask.PAUSED:
 507             raise DownloadCancelledException()
 508
 509     def calculate_speed(self, count, blockSize):
 510         if count % 5 == 0:
 511             now = time.time()
 512             if self.__start_time > 0:
 513                 # Has rate limiting been enabled or disabled?
 514                 if self.__limit_rate != gl.config.limit_rate:
 515                     # If it has been enabled then reset base time and block count
 516                     if gl.config.limit_rate:
 517                         self.__start_time = now
 518                         self.__start_blocks = count
 519                     self.__limit_rate = gl.config.limit_rate
 520
 521                 # Has the rate been changed and are we currently limiting?
 522                 if self.__limit_rate_value != gl.config.limit_rate_value and self.__limit_rate:
 523                     self.__start_time = now
 524                     self.__start_blocks = count
 525                     self.__limit_rate_value = gl.config.limit_rate_value
 526
 527                 passed = now - self.__start_time
 528                 if passed > 0:
 529                     speed = ((count-self.__start_blocks)*blockSize)/passed
 530                 else:
 531                     speed = 0
 532             else:
 533                 self.__start_time = now
 534                 self.__start_blocks = count
 535                 passed = now - self.__start_time
 536                 speed = count*blockSize
 537
 538             self.speed = float(speed)
 539
 540             if gl.config.limit_rate and speed > gl.config.limit_rate_value:
 541                 # calculate the time that should have passed to reach
 542                 # the desired download rate and wait if necessary
 543                 should_have_passed = float((count-self.__start_blocks)*blockSize)/(gl.config.limit_rate_value*1024.0)
 544                 if should_have_passed > passed:
 545                     # sleep a maximum of 10 seconds to not cause time-outs
 546                     delay = min(10.0, float(should_have_passed-passed))
 547                     time.sleep(delay)
 548
 549     def run(self):
 550         # Speed calculation (re-)starts here
 551         self.__start_time = 0
 552         self.__start_blocks = 0
 553
 554         # If the download has already been cancelled, skip it
 555         if self.status == DownloadTask.CANCELLED:
 556             util.delete_file(self.tempname)
 557             return False
 558
 559         # We only start this download if its status is "queued"
 560         if self.status != DownloadTask.QUEUED:
 561             return False
 562
 563         # We are downloading this file right now
 564         self.status = DownloadTask.DOWNLOADING
 565
 566         try:
 567             # Resolve URL and start downloading the episode
 568             url = resolver.get_real_download_url(self.__episode.url)
 569             downloader =  DownloadURLOpener(self.__episode.channel)
 570             (unused, headers) = downloader.retrieve_resume(url,
 571                     self.tempname, reporthook=self.status_updated)
 572
 573             new_mimetype = headers.get('content-type', self.__episode.mimetype)
 574             old_mimetype = self.__episode.mimetype
 575             if new_mimetype != old_mimetype:
 576                 log('Correcting mime type: %s => %s', old_mimetype, new_mimetype, sender=self)
 577                 old_extension = self.__episode.extension()
 578                 self.__episode.mimetype = new_mimetype
 579                 new_extension = self.__episode.extension()
 580
 581                 # If the desired filename extension changed due to the new mimetype,
 582                 # we force an update of the local filename to fix the extension
 583                 if old_extension != new_extension:
 584                     self.filename = self.__episode.local_filename(create=True, force_update=True)
 585
 586             shutil.move(self.tempname, self.filename)
 587
 588             # Get the _real_ filesize once we actually have the file
 589             self.__episode.length = os.path.getsize(self.filename)
 590             self.__episode.channel.addDownloadedItem(self.__episode)
 591
 592             # If a user command has been defined, execute the command setting some environment variables
 593             if len(gl.config.cmd_download_complete) > 0:
 594                 os.environ["GPODDER_EPISODE_URL"]=self.__episode.url or ''
 595                 os.environ["GPODDER_EPISODE_TITLE"]=self.__episode.title or ''
 596                 os.environ["GPODDER_EPISODE_FILENAME"]=self.filename or ''
 597                 os.environ["GPODDER_EPISODE_PUBDATE"]=str(int(self.__episode.pubDate))
 598                 os.environ["GPODDER_EPISODE_LINK"]=self.__episode.link or ''
 599                 os.environ["GPODDER_EPISODE_DESC"]=self.__episode.description or ''
 600                 util.run_external_command(gl.config.cmd_download_complete)
 601         except DownloadCancelledException:
 602             log('Download has been cancelled/paused: %s', self, sender=self)
 603             if self.status == DownloadTask.CANCELLED:
 604                 util.delete_file(self.tempname)
 605                 self.progress = 0.0
 606                 self.speed = 0.0
 607         except IOError, ioe:
 608             log( 'Error "%s" while downloading "%s": %s', ioe.strerror, self.__episode.title, ioe.filename, sender=self)
 609             self.status = DownloadTask.FAILED
 610             self.error_message = _('I/O Error: %s: %s') % (ioe.strerror, ioe.filename)
 611         except gPodderDownloadHTTPError, gdhe:
 612             log( 'HTTP error %s while downloading "%s": %s', gdhe.error_code, self.__episode.title, gdhe.error_message, sender=self)
 613             self.status = DownloadTask.FAILED
 614             self.error_message = _('HTTP Error %s: %s') % (gdhe.error_code, gdhe.error_message)
 615         except Exception, e:
 616             self.status = DownloadTask.FAILED
 617             self.error_message = _('Error: %s') % (e.message,)
 618
 619         if self.status == DownloadTask.DOWNLOADING:
 620             # Everything went well - we're done
 621             self.status = DownloadTask.DONE
 622             self.progress = 1.0
 623             return True
 624
 625         self.speed = 0.0
 626
 627         # We finished, but not successfully (at least not really)
 628         return False
 629